library(data.table) # Data table Tools
library(summarytools) # Summary Tools
library(ggplot2) # Plot Visualization Tool
library(data.table) # fread
library(rbokeh) # visualization of data
library(summarytools) # ORIGINALSummary
library(adabag) # bagging and boosting
library(caret) # pre-processing
library(dplyr) # select, mutate_if
library(fastDummies) # dummy_cols
library(splitTools) # data partition
library(rpart) # classification tree
library(rpart.plot) # plot regression trees
library(DT) # datatable
library(corrplot) # corrplot
library(gains) # gain
library(randomForest) # randomForest
library(cluster) # hierarchical clustering
library(knitr) # kable
library(kableExtra) # kbl
library(MASS) # lda, qda, etc.
library(dplyr) # Data Wrangling Tools
library(klaR) # partimat
library(forecast)
library(pROC)
library(tibble)
library(mda) # mda
library(RColorBrewer) # Color Palette
library(tidyverse) # useful Dataframe tools
library(glmnet) # Logistic Lasso Regression
library(car) # VIF 
library(ROCR) # ROC Curve
library(neuralnet) # Neural Network
library(nnet) # Neural network 
library(factoextra) # K-Means Clustering
library(ggpubr) # ggplot addons
library(GGally) # pairs plots
library (naniar)
# Confusion matrix
# Draw a caret::confusionMatrix result as a colored 2x2 matrix plot with a
# details panel of the per-class and overall statistics underneath.
#
#   cm           : object returned by caret::confusionMatrix() (needs $table,
#                  $byClass and $overall)
#   titleaddon   : optional string appended to the main title
#   class_labels : display labels for the two classes, in the order of the
#                  rows/columns of cm$table; defaults preserve the original
#                  hard-coded Benign/Malignant labels
draw_confusion_matrix <- function(cm, titleaddon = '',
                                  class_labels = c('Benign', 'Malignant')) {
  # two panels stacked: matrix takes the top 2/3, details the bottom 1/3
  layout(matrix(c(1, 1, 2)))
  par(mar = c(2, 2, 2, 2))
  plot(c(100, 345), c(300, 450), type = "n", xlab = "", ylab = "",
       xaxt = 'n', yaxt = 'n')
  title(paste0('CONFUSION MATRIX', ' ', titleaddon), cex.main = 2)

  # create the 2x2 matrix cells: Actual across the top, Predicted down the side
  rect(150, 430, 240, 370, col = '#1c6155')
  text(195, 435, class_labels[1], cex = 1.2)
  rect(250, 430, 340, 370, col = '#1c615570')
  text(295, 435, class_labels[2], cex = 1.2)
  text(125, 370, 'Predicted', cex = 1.3, srt = 90, font = 2)
  text(245, 450, 'Actual', cex = 1.3, font = 2)
  rect(150, 305, 240, 365, col = '#1c615570')
  rect(250, 305, 340, 365, col = '#1c6155')
  text(140, 400, class_labels[1], cex = 1.2, srt = 90)
  text(140, 335, class_labels[2], cex = 1.2, srt = 90)

  # add in the cm results; as.numeric() flattens the table column-major, so
  # res[1:2] are the first Actual column (both Predicted rows), res[3:4] the
  # second -- matching the cell positions drawn above
  res <- as.numeric(cm$table)
  text(195, 400, res[1], cex = 1.6, font = 2, col = 'white')
  text(195, 335, res[2], cex = 1.6, font = 2, col = 'white')
  text(295, 400, res[3], cex = 1.6, font = 2, col = 'white')
  text(295, 335, res[4], cex = 1.6, font = 2, col = 'white')

  # details panel: byClass statistics 1, 2, 5, 6, 7, 8 laid out left to right
  plot(c(100, 0), c(100, 0), type = "n", xlab = "", ylab = "",
       main = "DETAILS", xaxt = 'n', yaxt = 'n')
  stat_idx <- c(1, 2, 5, 6, 7, 8)
  x_pos <- c(5, 23, 41, 59, 77, 95)
  for (k in seq_along(stat_idx)) {
    i <- stat_idx[k]
    text(x_pos[k], 85, names(cm$byClass[i]), cex = 1.2, font = 2)
    text(x_pos[k], 70, round(as.numeric(cm$byClass[i]), 3), cex = 1.2)
  }

  # overall statistics 1 and 2 (accuracy information)
  text(30, 35, names(cm$overall[1]), cex = 1.5, font = 2)
  text(30, 20, round(as.numeric(cm$overall[1]), 3), cex = 1.4)
  text(70, 35, names(cm$overall[2]), cex = 1.5, font = 2)
  text(70, 20, round(as.numeric(cm$overall[2]), 3), cex = 1.4)
}

Data Analysis

ORIGINAL <- fread("data/Breast_Cancer/breast-cancer.csv")

Structure of Data

# Print an HTML-rendered summary table of the raw data (summarytools)
print(dfSummary(ORIGINAL, valid.col = FALSE, graph.magnif = 0.75, plain.ascii = FALSE, html = TRUE, style ='grid', silent = TRUE), max.tbl.height = 300, width = 80, method = "render")

Data Frame Summary

ORIGINAL

Dimensions: 569 x 32
Duplicates: 0
No Variable Stats / Values Freqs (% of Valid) Graph Missing
1 id [integer]
Mean (sd) : 30371831 (125020586)
min ≤ med ≤ max:
8670 ≤ 906024 ≤ 911320502
IQR (CV) : 7943911 (4.1)
569 distinct values 0 (0.0%)
2 diagnosis [character]
1. B
2. M
357(62.7%)
212(37.3%)
0 (0.0%)
3 radius_mean [numeric]
Mean (sd) : 14.1 (3.5)
min ≤ med ≤ max:
7 ≤ 13.4 ≤ 28.1
IQR (CV) : 4.1 (0.2)
456 distinct values 0 (0.0%)
4 texture_mean [numeric]
Mean (sd) : 19.3 (4.3)
min ≤ med ≤ max:
9.7 ≤ 18.8 ≤ 39.3
IQR (CV) : 5.6 (0.2)
479 distinct values 0 (0.0%)
5 perimeter_mean [numeric]
Mean (sd) : 92 (24.3)
min ≤ med ≤ max:
43.8 ≤ 86.2 ≤ 188.5
IQR (CV) : 28.9 (0.3)
522 distinct values 0 (0.0%)
6 area_mean [numeric]
Mean (sd) : 654.9 (351.9)
min ≤ med ≤ max:
143.5 ≤ 551.1 ≤ 2501
IQR (CV) : 362.4 (0.5)
539 distinct values 0 (0.0%)
7 smoothness_mean [numeric]
Mean (sd) : 0.1 (0)
min ≤ med ≤ max:
0.1 ≤ 0.1 ≤ 0.2
IQR (CV) : 0 (0.1)
474 distinct values 0 (0.0%)
8 compactness_mean [numeric]
Mean (sd) : 0.1 (0.1)
min ≤ med ≤ max:
0 ≤ 0.1 ≤ 0.3
IQR (CV) : 0.1 (0.5)
537 distinct values 0 (0.0%)
9 concavity_mean [numeric]
Mean (sd) : 0.1 (0.1)
min ≤ med ≤ max:
0 ≤ 0.1 ≤ 0.4
IQR (CV) : 0.1 (0.9)
537 distinct values 0 (0.0%)
10 concave points_mean [numeric]
Mean (sd) : 0 (0)
min ≤ med ≤ max:
0 ≤ 0 ≤ 0.2
IQR (CV) : 0.1 (0.8)
542 distinct values 0 (0.0%)
11 symmetry_mean [numeric]
Mean (sd) : 0.2 (0)
min ≤ med ≤ max:
0.1 ≤ 0.2 ≤ 0.3
IQR (CV) : 0 (0.2)
432 distinct values 0 (0.0%)
12 fractal_dimension_mean [numeric]
Mean (sd) : 0.1 (0)
min ≤ med ≤ max:
0 ≤ 0.1 ≤ 0.1
IQR (CV) : 0 (0.1)
499 distinct values 0 (0.0%)
13 radius_se [numeric]
Mean (sd) : 0.4 (0.3)
min ≤ med ≤ max:
0.1 ≤ 0.3 ≤ 2.9
IQR (CV) : 0.2 (0.7)
540 distinct values 0 (0.0%)
14 texture_se [numeric]
Mean (sd) : 1.2 (0.6)
min ≤ med ≤ max:
0.4 ≤ 1.1 ≤ 4.9
IQR (CV) : 0.6 (0.5)
519 distinct values 0 (0.0%)
15 perimeter_se [numeric]
Mean (sd) : 2.9 (2)
min ≤ med ≤ max:
0.8 ≤ 2.3 ≤ 22
IQR (CV) : 1.8 (0.7)
533 distinct values 0 (0.0%)
16 area_se [numeric]
Mean (sd) : 40.3 (45.5)
min ≤ med ≤ max:
6.8 ≤ 24.5 ≤ 542.2
IQR (CV) : 27.3 (1.1)
528 distinct values 0 (0.0%)
17 smoothness_se [numeric]
Mean (sd) : 0 (0)
min ≤ med ≤ max:
0 ≤ 0 ≤ 0
IQR (CV) : 0 (0.4)
547 distinct values 0 (0.0%)
18 compactness_se [numeric]
Mean (sd) : 0 (0)
min ≤ med ≤ max:
0 ≤ 0 ≤ 0.1
IQR (CV) : 0 (0.7)
541 distinct values 0 (0.0%)
19 concavity_se [numeric]
Mean (sd) : 0 (0)
min ≤ med ≤ max:
0 ≤ 0 ≤ 0.4
IQR (CV) : 0 (0.9)
533 distinct values 0 (0.0%)
20 concave points_se [numeric]
Mean (sd) : 0 (0)
min ≤ med ≤ max:
0 ≤ 0 ≤ 0.1
IQR (CV) : 0 (0.5)
507 distinct values 0 (0.0%)
21 symmetry_se [numeric]
Mean (sd) : 0 (0)
min ≤ med ≤ max:
0 ≤ 0 ≤ 0.1
IQR (CV) : 0 (0.4)
498 distinct values 0 (0.0%)
22 fractal_dimension_se [numeric]
Mean (sd) : 0 (0)
min ≤ med ≤ max:
0 ≤ 0 ≤ 0
IQR (CV) : 0 (0.7)
545 distinct values 0 (0.0%)
23 radius_worst [numeric]
Mean (sd) : 16.3 (4.8)
min ≤ med ≤ max:
7.9 ≤ 15 ≤ 36
IQR (CV) : 5.8 (0.3)
457 distinct values 0 (0.0%)
24 texture_worst [numeric]
Mean (sd) : 25.7 (6.1)
min ≤ med ≤ max:
12 ≤ 25.4 ≤ 49.5
IQR (CV) : 8.6 (0.2)
511 distinct values 0 (0.0%)
25 perimeter_worst [numeric]
Mean (sd) : 107.3 (33.6)
min ≤ med ≤ max:
50.4 ≤ 97.7 ≤ 251.2
IQR (CV) : 41.3 (0.3)
514 distinct values 0 (0.0%)
26 area_worst [numeric]
Mean (sd) : 880.6 (569.4)
min ≤ med ≤ max:
185.2 ≤ 686.5 ≤ 4254
IQR (CV) : 568.7 (0.6)
544 distinct values 0 (0.0%)
27 smoothness_worst [numeric]
Mean (sd) : 0.1 (0)
min ≤ med ≤ max:
0.1 ≤ 0.1 ≤ 0.2
IQR (CV) : 0 (0.2)
411 distinct values 0 (0.0%)
28 compactness_worst [numeric]
Mean (sd) : 0.3 (0.2)
min ≤ med ≤ max:
0 ≤ 0.2 ≤ 1.1
IQR (CV) : 0.2 (0.6)
529 distinct values 0 (0.0%)
29 concavity_worst [numeric]
Mean (sd) : 0.3 (0.2)
min ≤ med ≤ max:
0 ≤ 0.2 ≤ 1.3
IQR (CV) : 0.3 (0.8)
539 distinct values 0 (0.0%)
30 concave points_worst [numeric]
Mean (sd) : 0.1 (0.1)
min ≤ med ≤ max:
0 ≤ 0.1 ≤ 0.3
IQR (CV) : 0.1 (0.6)
492 distinct values 0 (0.0%)
31 symmetry_worst [numeric]
Mean (sd) : 0.3 (0.1)
min ≤ med ≤ max:
0.2 ≤ 0.3 ≤ 0.7
IQR (CV) : 0.1 (0.2)
500 distinct values 0 (0.0%)
32 fractal_dimension_worst [numeric]
Mean (sd) : 0.1 (0)
min ≤ med ≤ max:
0.1 ≤ 0.1 ≤ 0.2
IQR (CV) : 0 (0.2)
535 distinct values 0 (0.0%)

Generated by summarytools 1.0.1 (R version 4.2.1)
2022-12-17

Missing values

gg_miss_var(ORIGINAL) + ggtitle("NAs")
## Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
## ggplot2 3.3.4.
## i Please use "none" instead.
## i The deprecated feature was likely used in the naniar package.
##   Please report the issue at <https://github.com/njtierney/naniar/issues>.

Distribution of Data

# Center and scale all numeric columns, then reshape to long format
# (one row per observation x variable) for faceted boxplots by diagnosis.
norm.value <- preProcess(ORIGINAL, method = c("center", "scale"))
ORIGINAL.boxplot <- predict(norm.value, ORIGINAL)
# Name id.vars explicitly so melt() does not have to guess them
# (silences the melt.data.table warning about internally guessed id.vars)
ORIGINAL.boxplot <- melt(dplyr::select(ORIGINAL.boxplot, -c(id)),
                         id.vars = "diagnosis")
ggplot(ORIGINAL.boxplot, aes(x = diagnosis, y = value)) +
  facet_wrap(~variable) +
  stat_boxplot(geom ='errorbar') +
  geom_boxplot()

# histogram over all columns (id dropped), one facet per predictor;
# pivot_longer() replaces the superseded gather()
ggplot(pivot_longer(dplyr::select(ORIGINAL, -c(id)), cols = -diagnosis,
                    names_to = "key", values_to = "value"),
       aes(value)) + 
    geom_histogram(bins = 10) + 
    facet_wrap(~key, scales = 'free')

# histogram over all columns grouped by diagnosis, density-scaled so the
# two classes are comparable; after_stat(density) replaces the deprecated
# ..density.. notation and pivot_longer() the superseded gather()
ggplot(pivot_longer(dplyr::select(ORIGINAL, -c(id)), cols = -diagnosis,
                    names_to = "key", values_to = "value"),
       aes(value, fill = factor(diagnosis))) + 
  geom_histogram(aes(y = after_stat(density)), alpha = 0.6, position = "identity") + 
  facet_wrap(~key, scales = 'free') + 
  ggtitle("Histogram of predictors seperated by class") + 
  theme(plot.title = element_text(hjust = 0.5)) + 
  guides(fill=guide_legend(title="Diagnosis"))+
  scale_fill_discrete(labels=c('benign', 'malignant'))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Correlation

# Color Palette (already attached at the top of the file; harmless reload)
library("RColorBrewer")

# select numeric variables: drop id (col 1) and diagnosis (col 2)
Corr_Data <- na.omit(ORIGINAL[,-c(1,2)])
Corr_plot <- cor(Corr_Data)

# Correlations plotting: colored matrix with coefficients overlaid
corrplot(Corr_plot, method = "color", col=brewer.pal(n=8, name="BuGn"),tl.col="black",tl.srt=45, addCoef.col = "black",number.cex = 1)

Pairs plots

predictors means plots:

# Pairs plot of the ten "_mean" predictors (columns 3:12), colored by diagnosis
ggpairs(ORIGINAL, columns = 3:12, aes(color = diagnosis, alpha = 0.5),
        diag = list(continuous = "blankDiag"),  title = "Pairs plot for Means of Predictors")+ 
   theme(axis.text.x = element_text(angle = 90), axis.title.y.right = element_text(size = 0.4)) # readable x axis

predictors SE plots:

# Pairs plot of the ten "_se" predictors (columns 13:22), colored by diagnosis
ggpairs(ORIGINAL, columns = 13:22, aes(color = diagnosis, alpha = 0.5),
        diag = list(continuous = "blankDiag"),  title = "Pairs plot for SE of Predictors")+ 
   theme(axis.text.x = element_text(angle = 90), axis.title.y.right = element_text(size = 0.4)) # readable x axis

predictors “worst” plots:

# Pairs plot of the ten "_worst" predictors (columns 23:32), colored by diagnosis
ggpairs(ORIGINAL, columns = 23:32, aes(color = diagnosis, alpha = 0.5),
        diag = list(continuous = "blankDiag"), title = "Pairs plot for Worsts of Predictors")+ 
   theme(axis.text.x = element_text(angle = 90),axis.title.y.right = element_text(size = 0.4)) # readable x axis

Data Preparation

Transformation

# Transform Character Format to Binary Numerical Values on Outcome Variable "Diagnosis"

# Recode in place: "M" (malignant) becomes "1", "B" (benign) becomes "0"
ORIGINAL$diagnosis[ORIGINAL$diagnosis == "M"] <- "1"
ORIGINAL$diagnosis[ORIGINAL$diagnosis == "B"] <- "0"

ORIGINAL$diagnosis <- as.factor(ORIGINAL$diagnosis) # factor with levels "0" < "1"

Standardization

We standardize the ORIGINAL data set by centering and scaling each predictor; alternatively, range ([0, 1]) normalization could be used.

Partitioning

We partition the data into Training (50%), Validation (30%) and Test (20%)

set.seed(1)

# Randomly assign each row to set 1/2/3 with target probabilities
# 50% Training, 30% Validation, 20% Test
splitting <- sample(1:3,size=nrow(ORIGINAL),replace=TRUE,prob=c(0.5,0.3,0.2))
Training <- ORIGINAL[splitting==1,]
Validation <- ORIGINAL[splitting==2,]
Test <- ORIGINAL[splitting==3,]

# Checking if proportions are right (actual shares deviate slightly from
# the targets because each row is assigned independently at random)
Prop_Training <- (nrow(Training)/nrow(ORIGINAL))*100
Prop_Validation <- (nrow(Validation)/nrow(ORIGINAL))*100
Prop_Test <- (nrow(Test)/nrow(ORIGINAL))*100

# Print Proportion
paste("The Proportions are:", round(Prop_Training,2),"% In Training,",round(Prop_Validation,2),"% In Validation, and ",round(Prop_Test,2),"% In Test")
## [1] "The Proportions are: 52.72 % In Training, 27.94 % In Validation, and  19.33 % In Test"

Supervised Learning

Logistic Regression

Assumptions for Logistic Regression:

  1. The dependent variable must be categorical in nature.
  2. The independent variable should not have multi-collinearity.

Type of Logistic Regression:

  1. Binomial (we will use this type since we only have binary outcomes, malign or benign)
  2. Multinomial
  3. Ordinal

Fit the Logistic Regression Model

set.seed(1)

# Duplicate the Training and Validation Set so the originals stay untouched
Training_Logistic <- Training
Validation_Logistic <- Validation

# Remove the "ID" Variable (row identifier, not a predictor)
Training_Logistic <- Training_Logistic[,-c("id")]
Validation_Logistic <- Validation_Logistic[,-c("id")]
set.seed(1)

# Fit a binomial logistic regression on all "_mean" and "_se" predictors.
# Backticks are required for column names that contain spaces.
Logistic_Model_1 <- glm(diagnosis ~ radius_mean + texture_mean + perimeter_mean + area_mean + smoothness_mean + compactness_mean + concavity_mean + `concave points_mean` + symmetry_mean + fractal_dimension_mean + radius_se + texture_se + perimeter_se + area_se + smoothness_se + compactness_se + concavity_se + `concave points_se` + symmetry_se + fractal_dimension_se, family=binomial(link='logit'), data=Training_Logistic)
## Warning: glm.fit: si sono verificate probabilità stimate numericamente pari a 0
## o 1
## (Italian-locale warning: "fitted probabilities numerically 0 or 1 occurred")
# Disable Scientific Notation
options(scipen=999)

# Model Summary
summary(Logistic_Model_1)
## 
## Call:
## glm(formula = diagnosis ~ radius_mean + texture_mean + perimeter_mean + 
##     area_mean + smoothness_mean + compactness_mean + concavity_mean + 
##     `concave points_mean` + symmetry_mean + fractal_dimension_mean + 
##     radius_se + texture_se + perimeter_se + area_se + smoothness_se + 
##     compactness_se + concavity_se + `concave points_se` + symmetry_se + 
##     fractal_dimension_se, family = binomial(link = "logit"), 
##     data = Training_Logistic)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.317   0.000   0.000   0.000   2.136  
## 
## Coefficients:
##                            Estimate   Std. Error z value Pr(>|z|)
## (Intercept)              -360.60177    545.86789  -0.661    0.509
## radius_mean               123.35635    193.50517   0.637    0.524
## texture_mean                5.41398      7.08341   0.764    0.445
## perimeter_mean            -20.56072     29.99195  -0.686    0.493
## area_mean                   0.08655      0.18712   0.463    0.644
## smoothness_mean         -2341.41264   3334.91457  -0.702    0.483
## compactness_mean          379.21208    569.93197   0.665    0.506
## concavity_mean            943.41790   1350.82135   0.698    0.485
## `concave points_mean`    1504.32982   2010.64280   0.748    0.454
## symmetry_mean             773.95029   1002.93720   0.772    0.440
## fractal_dimension_mean   5851.30425   8045.54675   0.727    0.467
## radius_se                 -76.50918    230.85247  -0.331    0.740
## texture_se                -34.99463     45.23880  -0.774    0.439
## perimeter_se               14.70440     27.64997   0.532    0.595
## area_se                     2.37526      3.70090   0.642    0.521
## smoothness_se           -4663.05813   6377.73774  -0.731    0.465
## compactness_se          -2567.06626   3467.70359  -0.740    0.459
## concavity_se             -204.44834    667.66965  -0.306    0.759
## `concave points_se`      1708.06481   2562.64576   0.667    0.505
## symmetry_se             -1087.68610   1385.67389  -0.785    0.432
## fractal_dimension_se   -18817.19739  25354.48076  -0.742    0.458
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 393.187  on 299  degrees of freedom
## Residual deviance:  20.026  on 279  degrees of freedom
## AIC: 62.026
## 
## Number of Fisher Scoring iterations: 18

Comments: Trying to fit every variable in our logistic regression produced a convergence error because of complete separation. To deal with this we should use a penalized model, since too many variables are included in our model relative to the number of observations. (See [Convergence Error in Logistic Regression] and [Penalized Logistic Regression Essentials in R: Ridge, Lasso and Elastic Net] in References)

Fit a Penalized Model for Logistic Regression

There are 3 different methods when it comes to Penalized Logistic Regression Models:

  1. ridge regression: variables with minor contribution have their coefficients close to zero. However, all the variables are incorporated in the model. This is useful when all variables need to be incorporated in the model according to domain knowledge.
  2. lasso regression: the coefficients of some less contributive variables are forced to be exactly zero. Only the most significant variables are kept in the final model.
  3. elastic net regression: the combination of ridge and lasso regression. It shrinks some coefficients toward zero (like ridge regression) and set some coefficients to exactly zero (like lasso regression)

For this case, we will use a Lasso Regression Model, since it is stricter, assigning little to no weight to variables that are not significant enough.

# Required Packages (already attached at the top of the file; harmless reloads)
library(tidyverse)
library(caret)
library(glmnet)

# Setting Seed
set.seed(1)

# Define response variable
# NOTE(review): as.numeric() on the factor yields codes 1/2, and cv.glmnet()
# defaults to family = "gaussian", so this is a *linear* lasso on a binary
# outcome. family = "binomial" (passing the factor itself) would be the
# proper logistic lasso -- confirm intent before changing, as it would alter
# the variable selection reported below.
y_lasso <- as.numeric(Training_Logistic$diagnosis)

# Define matrix of predictor variables
x_lasso <- data.matrix(Training_Logistic[,-c("diagnosis")])

# Perform k-fold cross-validation to find optimal lambda value - alpha = 1 is for using Lasso Method
cv_model <- cv.glmnet(x_lasso, y_lasso, alpha = 1)

# Find optimal lambda value that minimizes test MSE
best_lambda <- cv_model$lambda.min
print(paste("Best Lambda is equal to",best_lambda))
## [1] "Best Lambda is equal to 0.000578270146672675"
# Produce plot of test MSE by lambda value
plot(cv_model) 

Comments: We want to use the lowest MSE and thus find the optimal Lambda.

set.seed(1)

# Refit the lasso at the cross-validated lambda; alpha = 1 selects the Lasso penalty
Logistic_Lasso_Optimal <- glmnet(x_lasso, y_lasso, alpha = 1, lambda = best_lambda)

# Disable Scientific Notation
options(scipen=999)

# Coefficient vector: entries printed as "." were shrunk exactly to zero
Logistic_Lasso_Optimal$beta
## 30 x 1 sparse Matrix of class "dgCMatrix"
##                                    s0
## radius_mean              .           
## texture_mean             0.0092460114
## perimeter_mean           0.0007441971
## area_mean                .           
## smoothness_mean         -0.6726642078
## compactness_mean        -3.0655109153
## concavity_mean           2.3768884384
## concave points_mean      .           
## symmetry_mean            0.6146556627
## fractal_dimension_mean  -5.5409126922
## radius_se                0.3947352904
## texture_se               0.0125006149
## perimeter_se             0.0175743608
## area_se                 -0.0026599944
## smoothness_se            2.4042915125
## compactness_se          -2.1473061382
## concavity_se            -3.0067441349
## concave points_se        8.7579605809
## symmetry_se              6.9831540153
## fractal_dimension_se    -7.8748193831
## radius_worst             0.0873539874
## texture_worst            .           
## perimeter_worst          0.0010958466
## area_worst              -0.0004789178
## smoothness_worst         2.8169401453
## compactness_worst        0.2083513325
## concavity_worst          0.0294152220
## concave points_worst     1.3314761934
## symmetry_worst           .           
## fractal_dimension_worst  5.2951497825

Comments: We can see that our Lasso model has shrunk some coefficients to 0; this is expected with Lasso Regression, since it removes insignificant variables entirely instead of merely assigning them a very low coefficient.

Logistic Regression using Variables from Lasso Selection

set.seed(1)

# Fit The Logistic Regression Model with only selected variables from Lasso
# (the predictors whose lasso coefficients were not shrunk to zero above)
Logistic_Model_After <- glm(diagnosis ~  area_mean + smoothness_mean + compactness_mean + concavity_mean + `concave points_mean` + symmetry_mean + fractal_dimension_mean + radius_se + texture_se + perimeter_se + area_se + smoothness_se + concavity_se + `concave points_se` + symmetry_se + fractal_dimension_se + radius_worst + texture_worst + area_worst + concavity_worst + `concave points_worst`+ symmetry_worst + fractal_dimension_worst , family=binomial(link='logit'), data=Training_Logistic)
## Warning: glm.fit: l'algoritmo non converge
## Warning: glm.fit: si sono verificate probabilità stimate numericamente pari a 0
## o 1
## (Italian-locale warnings: "algorithm did not converge"; "fitted
## probabilities numerically 0 or 1 occurred")
# Disable Scientific Notation
options(scipen=999)

# Model Summary
summary(Logistic_Model_After)
## 
## Call:
## glm(formula = diagnosis ~ area_mean + smoothness_mean + compactness_mean + 
##     concavity_mean + `concave points_mean` + symmetry_mean + 
##     fractal_dimension_mean + radius_se + texture_se + perimeter_se + 
##     area_se + smoothness_se + concavity_se + `concave points_se` + 
##     symmetry_se + fractal_dimension_se + radius_worst + texture_worst + 
##     area_worst + concavity_worst + `concave points_worst` + symmetry_worst + 
##     fractal_dimension_worst, family = binomial(link = "logit"), 
##     data = Training_Logistic)
## 
## Deviance Residuals: 
##          Min            1Q        Median            3Q           Max  
## -0.000091587  -0.000000021  -0.000000021   0.000000021   0.000095973  
## 
## Coefficients:
##                              Estimate    Std. Error z value Pr(>|z|)
## (Intercept)                -1036.5304  2846587.3158   0.000    1.000
## area_mean                     -0.1696     1462.2833   0.000    1.000
## smoothness_mean             1450.2601  4192046.3648   0.000    1.000
## compactness_mean           -1731.9712  1906426.5985  -0.001    0.999
## concavity_mean              1469.0815  3711329.7662   0.000    1.000
## `concave points_mean`       1070.0724  4578141.0180   0.000    1.000
## symmetry_mean               -165.2926  2286527.5901   0.000    1.000
## fractal_dimension_mean      -437.0810 14827275.0943   0.000    1.000
## radius_se                     -7.1344   886865.7546   0.000    1.000
## texture_se                   -26.0697   118646.6300   0.000    1.000
## perimeter_se                  17.0820    55798.5434   0.000    1.000
## area_se                        1.1568     9851.6304   0.000    1.000
## smoothness_se             -10007.8391 23596748.0400   0.000    1.000
## concavity_se               -2628.1095  7354098.2654   0.000    1.000
## `concave points_se`        25297.8713 16023563.8416   0.002    0.999
## symmetry_se                 -931.2103  8047236.1089   0.000    1.000
## fractal_dimension_se      -54837.6628 30037108.6303  -0.002    0.999
## radius_worst                  19.9328   175061.2117   0.000    1.000
## texture_worst                  6.0563     5777.3810   0.001    0.999
## area_worst                     0.1061     2460.1674   0.000    1.000
## concavity_worst              112.5699   552215.1283   0.000    1.000
## `concave points_worst`     -1639.6759  2321009.5161  -0.001    0.999
## symmetry_worst               260.3450   802972.6305   0.000    1.000
## fractal_dimension_worst     6959.5789  5101143.4606   0.001    0.999
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 393.187196953955  on 299  degrees of freedom
## Residual deviance:   0.000000089786  on 276  degrees of freedom
## AIC: 48
## 
## Number of Fisher Scoring iterations: 25

Comments: Even though we performed a Lasso Regression, our standard Logistic Regression still fails to converge with this selection of variables. This could be due to the number of variables (23 in our case) relative to the low number of observations. Now it is time to check the VIF, since multicollinearity could be our main source of problems.

VIF

We need to perform multiple rounds of removing multicollinear variables; let's see at which point we no longer have a multicollinearity problem.

Test For Multicolinearity in Our Dataset using VIF - First Iteration

set.seed(1)

# Load the car library (already attached at the top of the file; harmless reload)
library(car)

# Create vector of VIF values; VIF > 5 is the multicollinearity threshold used here
vif_values <- vif(Logistic_Model_After)

# Bar chart to display each VIF value (rotated labels via las=2)
barplot(vif_values, main = "VIF Values - First Iteration", horiz = FALSE, col = "steelblue", las=2)

# Add horizontal reference line at the VIF = 5 threshold
abline(h = 5, lwd = 3, lty = 2)

# Call VIF Values
vif_values
##               area_mean         smoothness_mean        compactness_mean 
##              12711.7372                474.1363                658.4854 
##          concavity_mean   `concave points_mean`           symmetry_mean 
##               5543.8846               1826.2882                387.8132 
##  fractal_dimension_mean               radius_se              texture_se 
##                836.5596               3823.2095                637.7963 
##            perimeter_se                 area_se           smoothness_se 
##                603.9070               9076.0715                736.7649 
##            concavity_se     `concave points_se`             symmetry_se 
##               2048.6454                395.2401                291.1494 
##    fractal_dimension_se            radius_worst           texture_worst 
##                236.8928              23366.1249                144.7868 
##              area_worst         concavity_worst  `concave points_worst` 
##              52809.4217                611.4157                691.8691 
##          symmetry_worst fractal_dimension_worst 
##                210.7082                519.5269

Comments: We can see that most of our variables exhibit multicollinearity (VIF over 5). We remove the variables with the highest VIF first, starting with these 7: radius_worst, area_worst, concavity_worst, concavity_mean, fractal_dimension_se, concavity_se and concave points_mean.

Logistic Regression with VIF removing 7 variables - Second Iteration

set.seed(1)

# Fit The Logistic Regression Model without the high-VIF variables flagged
# in the first VIF iteration
Logistic_Model_After_VIF1 <- glm(diagnosis ~  area_mean + smoothness_mean + compactness_mean + symmetry_mean + fractal_dimension_mean + radius_se + texture_se + perimeter_se + area_se + smoothness_se + `concave points_se` + symmetry_se + texture_worst + `concave points_worst`+ symmetry_worst + fractal_dimension_worst , family=binomial(link='logit'), data=Training_Logistic)
## Warning: glm.fit: si sono verificate probabilità stimate numericamente pari a 0
## o 1
## (Italian-locale warning: "fitted probabilities numerically 0 or 1 occurred")
# Disable Scientific Notation
options(scipen=999)

# Model Summary
summary(Logistic_Model_After_VIF1)
## 
## Call:
## glm(formula = diagnosis ~ area_mean + smoothness_mean + compactness_mean + 
##     symmetry_mean + fractal_dimension_mean + radius_se + texture_se + 
##     perimeter_se + area_se + smoothness_se + `concave points_se` + 
##     symmetry_se + texture_worst + `concave points_worst` + symmetry_worst + 
##     fractal_dimension_worst, family = binomial(link = "logit"), 
##     data = Training_Logistic)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.66215  -0.02679  -0.00130   0.00007   2.74659  
## 
## Coefficients:
##                            Estimate  Std. Error z value Pr(>|z|)  
## (Intercept)              -28.101214   21.470757  -1.309   0.1906  
## area_mean                  0.005558    0.012446   0.447   0.6552  
## smoothness_mean           95.448428   87.774297   1.087   0.2768  
## compactness_mean         -11.799465   43.355986  -0.272   0.7855  
## symmetry_mean            -50.291312   44.623905  -1.127   0.2597  
## fractal_dimension_mean  -170.084912  282.240446  -0.603   0.5468  
## radius_se                 16.946697   41.812049   0.405   0.6853  
## texture_se                -0.751951    2.176595  -0.345   0.7297  
## perimeter_se              -0.005054    1.670765  -0.003   0.9976  
## area_se                    0.054588    0.459390   0.119   0.9054  
## smoothness_se            145.423653  413.481380   0.352   0.7251  
## `concave points_se`     -111.604244  340.577154  -0.328   0.7431  
## symmetry_se              -86.243828  166.526520  -0.518   0.6045  
## texture_worst              0.412596    0.222259   1.856   0.0634 .
## `concave points_worst`    96.771896   44.386229   2.180   0.0292 *
## symmetry_worst            31.814364   23.021997   1.382   0.1670  
## fractal_dimension_worst  -11.018661  114.043706  -0.097   0.9230  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 393.187  on 299  degrees of freedom
## Residual deviance:  31.902  on 283  degrees of freedom
## AIC: 65.902
## 
## Number of Fisher Scoring iterations: 11

Comments: Even after removing these variables, our model still suffers from a convergence error; let's check the VIF again.

Test For Multicolinearity in Our Dataset using VIF - Second Iteration

set.seed(1)

# Load the car library (already attached at the top of the file; harmless reload)
library(car)

# Create vector of VIF values for the second-iteration model
vif_values_2 <- vif(Logistic_Model_After_VIF1)

# Bar chart to display each VIF value (rotated labels via las=2)
barplot(vif_values_2, main = "VIF Values - Second Iteration", horiz = FALSE, col = "steelblue", las=2)

# Add horizontal reference line at the VIF = 5 threshold
abline(h = 5, lwd = 3, lty = 2)

# Call VIF Values
vif_values_2
##               area_mean         smoothness_mean        compactness_mean 
##               15.595581                6.145892               10.148517 
##           symmetry_mean  fractal_dimension_mean               radius_se 
##                5.027208               13.100655              153.870701 
##              texture_se            perimeter_se                 area_se 
##                5.769376                6.997751              151.905108 
##           smoothness_se     `concave points_se`             symmetry_se 
##                4.069325               12.270014                8.053918 
##           texture_worst  `concave points_worst`          symmetry_worst 
##                6.419859                7.861339               10.357040 
## fractal_dimension_worst 
##               11.174660

Comments: We can still see high VIF values in our variables, let’s remove 6 variables again: radius_se, perimeter_se, area_se, compactness_mean, texture_se and texture_worst.

Logistic Regression with VIF removing 6 variables - Third Iteration

set.seed(1)

# Fit The Logistic Regression Model after the second round of removing
# high-VIF variables
Logistic_Model_After_VIF2 <- glm(diagnosis ~  area_mean + smoothness_mean + symmetry_mean + fractal_dimension_mean + smoothness_se + `concave points_se` + symmetry_se + `concave points_worst`+ symmetry_worst + fractal_dimension_worst , family=binomial(link='logit'), data=Training_Logistic)
## Warning: glm.fit: si sono verificate probabilità stimate numericamente pari a 0
## o 1
## (Italian-locale warning: "fitted probabilities numerically 0 or 1 occurred")
# Disable Scientific Notation
options(scipen=999)

# Model Summary
summary(Logistic_Model_After_VIF2)
## 
## Call:
## glm(formula = diagnosis ~ area_mean + smoothness_mean + symmetry_mean + 
##     fractal_dimension_mean + smoothness_se + `concave points_se` + 
##     symmetry_se + `concave points_worst` + symmetry_worst + fractal_dimension_worst, 
##     family = binomial(link = "logit"), data = Training_Logistic)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6820  -0.0722  -0.0158   0.0093   4.3901  
## 
## Coefficients:
##                            Estimate  Std. Error z value Pr(>|z|)    
## (Intercept)              -14.896503    7.552690  -1.972 0.048570 *  
## area_mean                  0.012886    0.003772   3.416 0.000634 ***
## smoothness_mean           48.131843   55.833555   0.862 0.388655    
## symmetry_mean             -7.054654   30.331490  -0.233 0.816084    
## fractal_dimension_mean  -261.372561  186.237770  -1.403 0.160487    
## smoothness_se            514.036777  280.214593   1.834 0.066589 .  
## `concave points_se`     -139.375305  216.104775  -0.645 0.518964    
## symmetry_se               30.505299  117.585842   0.259 0.795303    
## `concave points_worst`    59.599022   23.907022   2.493 0.012669 *  
## symmetry_worst            12.023580   16.496592   0.729 0.466092    
## fractal_dimension_worst   64.714978   62.539298   1.035 0.300767    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 393.187  on 299  degrees of freedom
## Residual deviance:  51.151  on 289  degrees of freedom
## AIC: 73.151
## 
## Number of Fisher Scoring iterations: 9

Comments: Now we can see that our regression model converges; let’s run a third iteration of VIF to check for multicollinearity.

Test For Multicolinearity in Our Dataset using VIF - Third Iteration

# Third VIF iteration: recompute variance-inflation factors on the refitted
# model to see whether the remaining predictors are still collinear.
set.seed(1)

# Load the car library
library(car)

# Create vector of VIF values
vif_values_3 <- vif(Logistic_Model_After_VIF2)

# Bar chart (vertical bars, one per predictor) of the VIF values
barplot(vif_values_3, main = "VIF Values - Third Iteration", horiz = FALSE, col = "steelblue", las=2)

# Dashed horizontal reference line at the conventional VIF threshold of 5
abline(h = 5, lwd = 3, lty = 2)

# Print the VIF values
vif_values_3
##               area_mean         smoothness_mean           symmetry_mean 
##                2.036432                3.309360                3.734544 
##  fractal_dimension_mean           smoothness_se     `concave points_se` 
##                8.371764                2.751976                5.868334 
##             symmetry_se  `concave points_worst`          symmetry_worst 
##                5.369938                3.926850                7.396930 
## fractal_dimension_worst 
##                6.413996

Comments: We have indeed improved our model by excluding many multicollinear variables. Some variables still show a VIF higher than 5; let’s remove fractal_dimension_mean (the highest) and see if that improves things.

Logistic Regression with VIF removing 1 variable - Fourth Iteration

# Fourth modelling iteration: drop fractal_dimension_mean (highest VIF in the
# previous iteration) and refit.
set.seed(1)

# Fit The Logistic Regression Model with only selected variables from Lasso
Logistic_Model_After_VIF3 <- glm(diagnosis ~  area_mean + smoothness_mean + symmetry_mean + smoothness_se + `concave points_se` + symmetry_se + `concave points_worst`+ symmetry_worst + fractal_dimension_worst , family=binomial(link='logit'), data=Training_Logistic)
## Warning: glm.fit: si sono verificate probabilità stimate numericamente pari a 0
## o 1
# (Italian-locale warning above: "fitted probabilities numerically 0 or 1
# occurred" - quasi-complete separation persists.)
# Disable Scientific Notation
options(scipen=999)

# Model Summary
summary(Logistic_Model_After_VIF3)
## 
## Call:
## glm(formula = diagnosis ~ area_mean + smoothness_mean + symmetry_mean + 
##     smoothness_se + `concave points_se` + symmetry_se + `concave points_worst` + 
##     symmetry_worst + fractal_dimension_worst, family = binomial(link = "logit"), 
##     data = Training_Logistic)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5703  -0.0904  -0.0212   0.0091   4.2589  
## 
## Coefficients:
##                            Estimate  Std. Error z value  Pr(>|z|)    
## (Intercept)              -21.569221    5.766024  -3.741  0.000183 ***
## area_mean                  0.014251    0.003528   4.039 0.0000536 ***
## smoothness_mean            2.822523   42.528827   0.066  0.947085    
## symmetry_mean             -8.237752   28.629721  -0.288  0.773550    
## smoothness_se            544.620641  260.927403   2.087  0.036866 *  
## `concave points_se`     -184.081058  189.735324  -0.970  0.331947    
## symmetry_se               -2.199196  104.630291  -0.021  0.983231    
## `concave points_worst`    64.859936   22.683560   2.859  0.004245 ** 
## symmetry_worst            12.776311   14.687096   0.870  0.384355    
## fractal_dimension_worst   -2.007970   38.811877  -0.052  0.958739    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 393.187  on 299  degrees of freedom
## Residual deviance:  53.299  on 290  degrees of freedom
## AIC: 73.299
## 
## Number of Fisher Scoring iterations: 9

Comments: Now we can see that our regression model converges; let’s run a fourth iteration of VIF to check for multicollinearity.

Test For Multicolinearity in Our Dataset using VIF - Fourth Iteration

# Fourth VIF iteration on the reduced model.
set.seed(1)

# Load the car library
library(car)

# Create vector of VIF values
vif_values_4 <- vif(Logistic_Model_After_VIF3)

# Bar chart (vertical bars, one per predictor) of the VIF values
barplot(vif_values_4, main = "VIF Values - Fourth Iteration", horiz = FALSE, col = "steelblue", las=2)

# Dashed horizontal reference line at the conventional VIF threshold of 5
abline(h = 5, lwd = 3, lty = 2)

# Print the VIF values
vif_values_4
##               area_mean         smoothness_mean           symmetry_mean 
##                1.974465                2.170228                3.393934 
##           smoothness_se     `concave points_se`             symmetry_se 
##                2.506280                5.034108                4.507909 
##  `concave points_worst`          symmetry_worst fractal_dimension_worst 
##                3.656527                6.429923                2.915146

Comments: We can see that our highest VIF values come from symmetry_worst, we can remove it and check if our model is now free of multicolinearity issues.

Logistic Regression with VIF removing 1 variable - Fifth Iteration (Last)

# Fifth (final) modelling iteration: drop symmetry_worst (highest remaining
# VIF) and refit. This is the model used for the predictions below.
set.seed(1)

# Fit The Logistic Regression Model with only selected variables from Lasso
Logistic_Model_After_VIF4 <- glm(diagnosis ~  area_mean + smoothness_mean + symmetry_mean + smoothness_se + `concave points_se` + symmetry_se + `concave points_worst` + fractal_dimension_worst , family=binomial(link='logit'), data=Training_Logistic)
## Warning: glm.fit: si sono verificate probabilità stimate numericamente pari a 0
## o 1
# (Italian-locale warning above: "fitted probabilities numerically 0 or 1
# occurred".)
# Disable Scientific Notation
options(scipen=999)

# Model Summary
summary(Logistic_Model_After_VIF4)
## 
## Call:
## glm(formula = diagnosis ~ area_mean + smoothness_mean + symmetry_mean + 
##     smoothness_se + `concave points_se` + symmetry_se + `concave points_worst` + 
##     fractal_dimension_worst, family = binomial(link = "logit"), 
##     data = Training_Logistic)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6582  -0.0909  -0.0251   0.0086   4.2428  
## 
## Coefficients:
##                            Estimate  Std. Error z value  Pr(>|z|)    
## (Intercept)              -21.031832    5.780075  -3.639  0.000274 ***
## area_mean                  0.014467    0.003604   4.014 0.0000597 ***
## smoothness_mean           -6.948246   39.846117  -0.174  0.861569    
## symmetry_mean              6.581404   23.217996   0.283  0.776823    
## smoothness_se            483.403351  217.848490   2.219  0.026487 *  
## `concave points_se`     -273.861042  131.451779  -2.083  0.037219 *  
## symmetry_se               61.479884   66.461799   0.925  0.354945    
## `concave points_worst`    71.119350   21.538479   3.302  0.000960 ***
## fractal_dimension_worst    8.358018   35.734350   0.234  0.815068    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 393.187  on 299  degrees of freedom
## Residual deviance:  54.198  on 291  degrees of freedom
## AIC: 72.198
## 
## Number of Fisher Scoring iterations: 9

Comments: We improved some significance level removing this high VIF variable.

Test For Multicolinearity in Our Dataset using VIF - Fifth Iteration (Last)

# Fifth and final VIF iteration: confirm that the reduced logistic model is
# free of problematic multicollinearity.
set.seed(1)

# car provides vif(); already attached earlier, reloading is a no-op
library(car)

# Variance-inflation factor for each predictor of the final model
vif_values_5 <- vif(Logistic_Model_After_VIF4)

# Bar chart of the VIF values (vertical bars, rotated axis labels)
barplot(vif_values_5,
        main = "VIF Values - Fifth Iteration",
        horiz = FALSE,
        col = "steelblue",
        las = 2)

# Dashed horizontal reference line at the conventional VIF threshold of 5
abline(h = 5, lwd = 3, lty = 2)

# Echo the VIF values
vif_values_5
##               area_mean         smoothness_mean           symmetry_mean 
##                2.054457                1.987622                2.246336 
##           smoothness_se     `concave points_se`             symmetry_se 
##                2.286389                2.614686                2.244157 
##  `concave points_worst` fractal_dimension_worst 
##                3.857056                2.524969

Comments: Now we can see that none of our selected variables are subject to multicollinearity anymore. Let’s use this model to compute some predictions.

Predictions

Logistic Lasso Regression - Predictions and Confusion Matrix on Validation

# Score the final logistic model on the validation set and build the
# confusion matrix (positive class = "1" = malignant).
set.seed(1)

# Predictions with LR: predicted probabilities of the positive class
Logistic_Lasso_Predictions <- predict(Logistic_Model_After_VIF4, Validation_Logistic[,c("area_mean", "smoothness_mean", "symmetry_mean", "smoothness_se", "concave points_se", "symmetry_se", "concave points_worst", "fractal_dimension_worst")], type = "response")

# Rounding Predictions - 0.5 Threshold (probability -> 0/1 class label)
Logistic_Lasso_Predictions_Dummy <- round(Logistic_Lasso_Predictions)

# As Numeric
Logistic_Lasso_Predictions_Dummy <- as.numeric(Logistic_Lasso_Predictions_Dummy)

# Check rounding in a Dataframe (probability next to its rounded label)
DF_Logistic_Lasso_Predictions <- cbind(Logistic_Lasso_Predictions, Logistic_Lasso_Predictions_Dummy)

# As Factor - confusionMatrix() requires factors with matching levels
Logistic_Lasso_Predictions_Dummy <- as.factor(Logistic_Lasso_Predictions_Dummy)

# Confusion Matrix
Confusion_Matrix_Logistic_Lasso <- confusionMatrix(data = Logistic_Lasso_Predictions_Dummy, reference = Validation_Logistic$diagnosis, positive = "1")

# Confusion-matrix plot for the best logistic (Lasso/VIF-reduced) model on the
# validation set.
#
# The original definition duplicated ~40 lines of base-graphics code verbatim
# from the generic draw_confusion_matrix() helper defined at the top of this
# file. That helper already supports a custom title via its `titleaddon`
# argument (title is paste0('CONFUSION MATRIX', ' ', titleaddon)), so this
# wrapper delegates to it and produces an identical plot.
#
# @param cm A caret::confusionMatrix object.
draw_confusion_matrix_Logistic_Lasso <- function(cm) {
  draw_confusion_matrix(cm, titleaddon = 'for Logistic Regression - Validation')
}

# Plot the Confusion Matrix
draw_confusion_matrix_Logistic_Lasso(Confusion_Matrix_Logistic_Lasso)

Comments:

ROC Curve of the Logistic Regression - Validation

# ROC curve for the logistic model on the validation set, built from the
# predicted probabilities (not the rounded labels).
set.seed(1)

# Load ROCR Package
library(ROCR)

# Plot our ROC Curve (TPR vs FPR); the diagonal is the no-skill reference
pr <- ROCR::prediction(Logistic_Lasso_Predictions, Validation_Logistic$diagnosis)
prf <- ROCR::performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf, main="ROC for Validation Set")
abline(a = 0, b = 1) 

Comments:

Best Logistic Regression

# Collect the winning logistic model's outputs under "Best_*" names so they
# can be compared against the other model families at the end of the report.

# Confusion Best
Best_Logistic_Confusion <- Confusion_Matrix_Logistic_Lasso

# Predictions Best (already a factor; factor() here is a harmless no-op)
Best_Logistic_Predictions_Dummy <- Logistic_Lasso_Predictions_Dummy
Best_Logistic_Predictions_Dummy <- factor(Best_Logistic_Predictions_Dummy)

Best_Logistic_Predictions_Probabilities <- Logistic_Lasso_Predictions

# Best Predictions as Data frame
DF_Best_Logistic_Predictions <- data.frame(Best_Logistic_Predictions_Dummy, Best_Logistic_Predictions_Probabilities) # FINAL PREDICTIONS DATAFRAME

# Best Confusion as Data frame: Sensitivity, Specificity and Accuracy only
DF_Best_Logistic_Confusion <- data.frame(c(Best_Logistic_Confusion$byClass[c(1,2)], Best_Logistic_Confusion$overall[1]))
colnames(DF_Best_Logistic_Confusion) <- "Best Logistic Lasso Regression"
DF_Best_Logistic_Confusion <- t(DF_Best_Logistic_Confusion) # FINAL CONFUSION DATAFRAME

Classification Tree

# Prepare the data partitions for the classification tree: copy, drop the id
# column, and sanity-check the train/validation/test split proportions.

#Re-name the partitions in the data
Training_M <- Training
Validation_M <- Validation
Test_M <- Test

#Take out the id column (first column; it carries no predictive information)
Training_M <- data.frame(Training_M[,-c(1)])
Validation_M <- data.frame(Validation_M[,-c(1)])
Test_M<- data.frame(Test_M[, -c(1)])


# Checking if proportions are right (each partition as % of the full dataset)
Prop_Training <- (nrow(Training_M)/nrow(ORIGINAL))*100
Prop_Validation <- (nrow(Validation_M)/nrow(ORIGINAL))*100
Prop_Test <- (nrow(Test_M)/nrow(ORIGINAL))*100

# Print Proportion
paste("The Proportions are:", round(Prop_Training,2),"% In Training,",round(Prop_Validation,2),"% In Validation, and ",round(Prop_Test,2),"% In Test")
## [1] "The Proportions are: 52.72 % In Training, 27.94 % In Validation, and  19.33 % In Test"

The outcome variable is a binary factor, we model a classification tree. We first run a deep tree, with all the features included. Then proceed to reduce the size of the deeper tree through pruning.

run tree:

# Grow a deliberately deep classification tree (tiny cp, minbucket = 1) on all
# features; it will be pruned in the next step.
set.seed(1)
options(scipen=999)

tree_full <- rpart(diagnosis ~ ., 
              data = Training_M, 
              method = "class",  # "class" because Y is a binary factor
              minbucket = 1,
              cp = 0.00001) 

# Plot tree
rpart.plot(tree_full, yesno = TRUE, digits =-6)

# Count terminal (leaf) nodes of the full tree
length(tree_full$frame$var[tree_full$frame$var == "<leaf>"]) # End nodes
## [1] 9
relevance<-as.data.frame(tree_full$variable.importance) #we get the ranking of the variables by importance
kable(relevance, row.names = T,col.names="Variable Importance")%>% kable_paper("hover", full_width = T) #built table
Variable Importance
perimeter_worst 107.1041853
radius_worst 104.4460423
area_worst 103.3104820
radius_mean 96.4545551
perimeter_mean 94.4386002
area_mean 93.1159458
concave.points_worst 16.8085026
compactness_worst 7.2225217
symmetry_worst 7.2225217
concave.points_mean 6.9755085
concavity_worst 5.7780173
texture_mean 5.2906178
concavity_mean 5.1375986
texture_worst 3.9679634
fractal_dimension_mean 2.6453089
fractal_dimension_worst 2.6453089
smoothness_mean 1.9784946
texture_se 1.4970760
compactness_mean 1.0338243
concave.points_se 0.8040856
smoothness_se 0.7485380
compactness_se 0.5743468
printcp(tree_full, digits = 6) # print complexity value
## 
## Classification tree:
## rpart(formula = diagnosis ~ ., data = Training_M, method = "class", 
##     minbucket = 1, cp = 0.00001)
## 
## Variables actually used in tree construction:
## [1] concave.points_mean  concave.points_worst perimeter_worst     
## [4] radius_mean          radius_worst         smoothness_mean     
## [7] texture_mean        
## 
## Root node error: 109/300 = 0.363333
## 
## n= 300 
## 
##           CP nsplit  rel error   xerror      xstd
## 1 0.83486239      0 1.00000000 1.000000 0.0764263
## 2 0.08256881      1 0.16513761 0.266055 0.0469566
## 3 0.01834862      2 0.08256881 0.174312 0.0387028
## 4 0.00917431      4 0.04587156 0.165138 0.0377375
## 5 0.00001000      8 0.00917431 0.192661 0.0405438
# Visualise the CP progression, then prune at the CP with the minimum
# cross-validated error ("best pruned" tree).
plotcp(tree_full, upper = "splits") # we plot the progression of complexity values

#Prune the tree
min_xerr<- which.min(tree_full$cptable[,"xerror"]) # select minimum cross-validation error
cp_bp <- tree_full$cptable[min_xerr,"CP"]  # find the corresponding CP value, to get the "best pruned " tree


pruned_tree<- prune(tree_full, cp = cp_bp) # re-compute the tree with the selected Cp
rpart.plot(pruned_tree, yesno = TRUE, digits =-3)

# Count terminal (leaf) nodes after pruning
length(pruned_tree$frame$var[pruned_tree$frame$var == "<leaf>"]) # how many end nodes
## [1] 5

The fully grown tree is already quite small; we pruned it nonetheless.

Performance of Best Pruned Tree

# Evaluate the pruned tree on the validation set.

# classification prediction over validation data (hard class labels)
pruned_pred <- predict(pruned_tree, Validation_M, type = "class")
pruned_prob <- predict(pruned_tree, Validation_M, type = "prob") # class-probability matrix (one column per class)


# confusion matrix and accuracy of classification tree (positive = malignant)
tree_cf<- confusionMatrix(pruned_pred, Validation_M$diagnosis, positive = "1")
draw_confusion_matrix(tree_cf)

Sensitivity lower than Specificity ( Malign diagnosis is minority). Accuracy pretty high.

#ROC curve for the best pruned tree on the validation set
# Name the data-frame columns explicitly instead of relying on the mangled
# name ("Validation_M...1.") that data.frame() auto-generates from the
# unnamed Validation_M[,1] column - the original call depended on that
# accident of naming.
ROC_df <- data.frame(actual = Validation_M[, 1], pruned_pred = pruned_pred)
# Convert the factor levels ("0"/"1") to numerics for roc()
ROC_df$actual <- as.numeric(as.character(ROC_df$actual))
ROC_df$pruned_pred <- as.numeric(as.character(ROC_df$pruned_pred))
# AUC score / ROC object: response = ground truth, predictor = tree prediction
roc_score <- roc(response = ROC_df$actual, predictor = ROC_df$pruned_pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_score, main = "ROC curve")

K-Nearest Neighbor

As k-nearest neighbours is based on distances, the predictors need to be standardized so that they have a mean of 0 and a variance equal to 1. Standardizing is necessary because otherwise variables with larger values would have more influence on the distance being calculated.

# Build KNN-specific copies of the partitions (without id) and standardize
# them using statistics computed on the training set only (no leakage).

# partition
ORIGINAL.KNN.train <- dplyr::select(Training, -c(id))
ORIGINAL.KNN.valid <- dplyr::select(Validation, -c(id))
ORIGINAL.KNN.test <- dplyr::select(Test, -c(id))

# standardize: center/scale parameters are estimated on training data and
# applied unchanged to validation and test
norm.value <- preProcess(ORIGINAL.KNN.train, method = c("center", "scale"))
ORIGINAL.KNN.train <- predict(norm.value, ORIGINAL.KNN.train)
ORIGINAL.KNN.valid <- predict(norm.value, ORIGINAL.KNN.valid)
ORIGINAL.KNN.test <- predict(norm.value, ORIGINAL.KNN.test)

To get the best k, one can iterate over several values of k and choose the one with the highest value for the metric of interest. In this case we looked at accuracy and sensitivity.

# Grid-search k = 1..30: fit a KNN model for each k, score the validation set,
# and record sensitivity and accuracy.
set.seed(1)

# Result holders, one row per candidate k
accuracy.df <- data.frame(k = seq(1, 30, 1), accuracy = rep(0, 30))
# NOTE: the column name keeps the original "sensitivtiy" misspelling because
# the ggplot calls further down reference it by that exact name.
sensitivity.df <- data.frame(k = seq(1, 30, 1), sensitivtiy = rep(0, 30))


# iterating over different ks
for(i in 1:30){
  # nearest neighbor model with k = i
  KNN1 <- knn3(y = ORIGINAL.KNN.train$diagnosis, x = dplyr::select(ORIGINAL.KNN.train, -c(diagnosis)), k = i)

  # predictions response 
  KNN1.pred.valid.resp <- predict(KNN1, dplyr::select(ORIGINAL.KNN.valid, -c(diagnosis)), type = "class")
  
  # predictions prob - second column = probability of the positive class "1"
  KNN1.pred.valid.prob <- predict(KNN1, dplyr::select(ORIGINAL.KNN.valid, -c(diagnosis)), type = "prob")[,2]
  
  # Confusionmatrix: store sensitivity (byClass[1]) and accuracy (overall[1])
  sensitivity.df[i, 2] <- confusionMatrix(KNN1.pred.valid.resp, ORIGINAL.KNN.valid$diagnosis, positive = "1")$byClass[1]
  accuracy.df[i, 2] <- confusionMatrix(KNN1.pred.valid.resp, ORIGINAL.KNN.valid$diagnosis, positive = "1")$overall[1]

}

# table in markdown
datatable(accuracy.df)
datatable(sensitivity.df)
# plot the ks: accuracy and sensitivity as a function of k.
# `size` for lines was deprecated in ggplot2 3.4.0 (the knitted output even
# captured the deprecation warning), so `linewidth` is used instead.
ggplot(accuracy.df) +
 aes(x = k, y = accuracy) +
 geom_line(linewidth = 0.7, colour = "#112646") +
 labs(x = "Number of k nearest neighbours", 
 y = "Accuracy", title = "Accuracy regarding k") +
 theme_minimal()

# Note: the y aesthetic must keep the misspelled column name "sensitivtiy"
# because that is how the column was created above.
ggplot(sensitivity.df) +
 aes(x = k, y = sensitivtiy) +
 geom_line(linewidth = 0.7, colour = "#112646") +
 labs(x = "Number of k nearest neighbours", 
 y = "Sensitivtiy", title = "Sensitivity regarding k") +
 theme_minimal()

From the output we can see that the best k is either a 6 or 1. For accuracy the best k is at 6 while for sensitivity it is at 1 with the second best being at 6. Down below the code for KNN with k=6 and k=1.

# Refit and evaluate the k = 6 model (best accuracy in the grid search).
set.seed(1)
# nearest neighbor
KNN1 <- knn3(y = ORIGINAL.KNN.train$diagnosis, x = dplyr::select(ORIGINAL.KNN.train, -c(diagnosis)), k = 6)

# predictions response 
KNN1.pred.valid.resp <- predict(KNN1, dplyr::select(ORIGINAL.KNN.valid, -c(diagnosis)), type = "class")

# predictions prob - probability of the positive class "1"
KNN1.pred.valid.prob <- predict(KNN1, dplyr::select(ORIGINAL.KNN.valid, -c(diagnosis)), type = "prob")[,2]

# Confusionmatrix Validation
KNN1.conf.mat <- confusionMatrix(KNN1.pred.valid.resp, ORIGINAL.KNN.valid$diagnosis, positive = "1")
draw_confusion_matrix(KNN1.conf.mat, titleaddon = "KNN with k=6")

When looking at the output we see that 6 nearest neighbors doesn’t deliver a bad model but it lacks sensitivity which is very important in the case of classifying cancer.

Here we see the model with 1 nearest neighbor.

# Refit and evaluate the k = 1 model (best sensitivity in the grid search;
# kept for comparison only - see discussion of overfitting below).
set.seed(1)
# nearest neighbor
KNN2 <- knn3(y = ORIGINAL.KNN.train$diagnosis, x = dplyr::select(ORIGINAL.KNN.train, -c(diagnosis)), k = 1)

# predictions response 
KNN2.pred.valid.resp <- predict(KNN2, dplyr::select(ORIGINAL.KNN.valid, -c(diagnosis)), type = "class")

# predictions prob - probability of the positive class "1"
KNN2.pred.valid.prob <- predict(KNN2, dplyr::select(ORIGINAL.KNN.valid, -c(diagnosis)), type = "prob")[,2]

# Confusionmatrix Validation
KNN2.conf.mat <- confusionMatrix(KNN2.pred.valid.resp, ORIGINAL.KNN.valid$diagnosis, positive = "1")
draw_confusion_matrix(KNN2.conf.mat, titleaddon = 'KNN')

We see that this model with 1 nearest neighbor is overall a little better than the model with k equal to 6 and is better in sensitivity. But we will not continue with this model as it isn’t much better than the one with k equal to 6 and in KNN only one neighbor implies overfitting. As we do not want to run this risk we continue with k equal to 6.

Down below we print out again the best model.

# Collect the chosen KNN model's (k = 6) outputs under "Best_*" names for the
# final model comparison.

# Confusion Best
Best_KNN_Confusion <- KNN1.conf.mat

# Predictions Best (already a factor; factor() here is a harmless no-op)
Best_KNN_Predictions_Dummy <- KNN1.pred.valid.resp
Best_KNN_Predictions_Dummy <- factor(Best_KNN_Predictions_Dummy)

Best_KNN_Predictions_Probabilities <- KNN1.pred.valid.prob


# Best Predictions as Data frame
DF_Best_KNN_Predictions <- data.frame(Best_KNN_Predictions_Dummy, Best_KNN_Predictions_Probabilities) # FINAL PREDICTIONS DATAFRAME

# Best Confusion as Data frame: Sensitivity, Specificity and Accuracy only
DF_Best_KNN_Confusion <- data.frame(c(Best_KNN_Confusion$byClass[c(1,2)], Best_KNN_Confusion$overall[1]))
colnames(DF_Best_KNN_Confusion) <- "Best KNN"
DF_Best_KNN_Confusion <- t(DF_Best_KNN_Confusion) # FINAL CONFUSION DATAFRAME

Neural Networks

# Prepare data for the neural networks: copy the partitions, drop id, and
# center/scale using training-set statistics only (no leakage).
set.seed(1)

# Duplicate the Training and Validation Set
Training_NN <- Training
Validation_NN <- Validation
Test_NN <- Test

# Make Sure to be as Dataframe
Training_NN <- data.frame(Training_NN)
Validation_NN <- data.frame(Validation_NN)
Test_NN <- data.frame(Test_NN)

# Remove the "ID" Variable (first column)
Training_NN <- Training_NN[,-1]
Validation_NN <- Validation_NN[,-1]
Test_NN <- Test_NN[,-1]

# Preprocess Data: scaling parameters come from the training set and are
# applied unchanged to validation and test
Norm_NN <- preProcess(Training_NN, method = c("center", "scale"))
Training_NN_Preprocess <- predict(Norm_NN, Training_NN)
Validation_NN_Preprocess <- predict(Norm_NN, Validation_NN)
Test_NN_Preprocess <- predict(Norm_NN, Test_NN)

Basic Neural Network with 3 hidden nodes - Neural Model 1

set.seed(1)

# Load library
library(neuralnet)
library(nnet)

# Fit neural network with a single hidden layer of 3 nodes
# (the original comment said "3 hidden layers", but hidden=3 means
# one layer containing 3 neurons)
Neural_1 <- neuralnet(diagnosis ~ ., data = Training_NN_Preprocess, hidden=3, linear.output = FALSE)

# Plot Neural Network Model 1. rep = "best" draws the repetition with the
# smallest error and, unlike a bare plot() call, actually renders inside
# knitted R Markdown documents (the original `(plot(Neural_1))` only
# printed NULL).
plot(Neural_1, rep = "best")

Predictions and Confusion Matrix - Neural Model 1

Confusion Matrix with Validation

# Score Neural Model 1 on the validation set and build the confusion matrix.
set.seed(1)

# Predictions: matrix with one column per output class
Predictions_NN1 <- predict(Neural_1, Validation_NN_Preprocess, type="response")
# Second column = probability of the positive class "1"
Predictions_NN1_Probabilities <- Predictions_NN1[,2]

# Rounding Predictions - 0.5 Threshold
Predictions_NN1_Dummy <- round(Predictions_NN1_Probabilities)

# As Numeric
Predictions_NN1_Dummy <- as.numeric(Predictions_NN1_Dummy)

# Check rounding in a Dataframe (probability next to its rounded label)
DF_Neural_Predictions <- cbind(Predictions_NN1_Probabilities, Predictions_NN1_Dummy)

# As Factor - confusionMatrix() requires factors with matching levels
Predictions_NN1_Dummy <- factor(Predictions_NN1_Dummy)
Validation_NN_Preprocess$diagnosis <- factor(Validation_NN_Preprocess$diagnosis)

# Confusion Matrix
Confusion_Matrix_Neural_1 <- confusionMatrix(data = Predictions_NN1_Dummy, reference = Validation_NN_Preprocess$diagnosis, positive = "1")

# Confusion-matrix plot for Neural Network Model 1 (validation set).
#
# The original definition duplicated ~40 lines of base-graphics code verbatim
# from the generic draw_confusion_matrix() helper defined at the top of this
# file; that helper already supports a custom title via its `titleaddon`
# argument, so this wrapper delegates to it and produces an identical plot.
#
# @param cm A caret::confusionMatrix object.
draw_confusion_matrix_Neural_1 <- function(cm) {
  draw_confusion_matrix(cm, titleaddon = 'for Neural Network - Model 1')
}

# Plot the Confusion Matrix
draw_confusion_matrix_Neural_1(Confusion_Matrix_Neural_1)

Comments: Lowest Accuracy on Validation and Specificity.

Advanced Neural Network with 1 Hidden Layers of 15 Hidden Nodes - Model 2

Some rules of thumb can help in deciding the structure of our Neural Network Model, here are some from Stackoverflow and the book Introduction to Neural Networks for Java (second edition) by Jeff Heaton:

For Hidden Neurons: “1. The number of hidden neurons should be between the size of the input layer and the size of the output layer. 2. The number of hidden neurons should be 2/3 the size of the input layer, plus the size of the output layer. 3. The number of hidden neurons should be less than twice the size of the input layer.”

Since we have 30 input nodes (30 features/variables) and 2 output nodes (2 Classes with variable diagnosis), 15 hidden neurons sound like the number to go.

For Hidden Layers: “Problems that require two hidden layers are rarely encountered. However, neural networks with two hidden layers can represent functions with any kind of shape. There is currently no theoretical reason to use neural networks with any more than two hidden layers. In fact, for many practical problems, there is no reason to use any more than one hidden layer.” Thus we can use 1 layers and later see if 2 layers still be relevant compared to our validation and test set.

set.seed(1)

# Load library
library(neuralnet)
library(nnet)
library(devtools)
## Warning: il pacchetto 'devtools' è stato creato con R versione 4.2.2
## Caricamento del pacchetto richiesto: usethis
# (Italian-locale messages above: devtools was built under R 4.2.2 and loads
# its required package usethis.)

# Fit neural network with one hidden layer of 15 nodes
Neural_2 <- neuralnet(diagnosis ~ ., data = Training_NN_Preprocess, hidden=15, linear.output = FALSE)

# Plot Neural Network Model 2. rep = "best" renders the lowest-error
# repetition and works inside knitted documents, unlike the original
# `(plot(Neural_2))` which only printed NULL.
plot(Neural_2, rep = "best")

Predictions and Confusion Matrix - Neural Model 2

Confusion Matrix with Validation
# Score Neural Model 2 on the validation set and build the confusion matrix.
set.seed(1)

# Predictions: matrix with one column per output class
Predictions_NN2 <- predict(Neural_2, Validation_NN_Preprocess, type="response")
# Second column = probability of the positive class "1"
Predictions_NN2_Probabilities <- Predictions_NN2[,2]

# Rounding Predictions - 0.5 Threshold 
Predictions_NN2_Dummy <- round(Predictions_NN2_Probabilities)

# As Numeric
Predictions_NN2_Dummy <- as.numeric(Predictions_NN2_Dummy)

# Check rounding in a Dataframe (probability next to its rounded label)
DF_Neural_Predictions_2 <- cbind(Predictions_NN2_Probabilities, Predictions_NN2_Dummy)

# As Factor - confusionMatrix() requires factors with matching levels
Predictions_NN2_Dummy <- factor(Predictions_NN2_Dummy)
Validation_NN_Preprocess$diagnosis <- factor(Validation_NN_Preprocess$diagnosis)

# Confusion Matrix
Confusion_Matrix_Neural_2 <- confusionMatrix(data = Predictions_NN2_Dummy, reference = Validation_NN_Preprocess$diagnosis, positive = "1")

# Confusion-matrix plot for Neural Network Model 2 (validation set).
#
# The original definition duplicated ~40 lines of base-graphics code verbatim
# from the generic draw_confusion_matrix() helper defined at the top of this
# file; that helper already supports a custom title via its `titleaddon`
# argument, so this wrapper delegates to it and produces an identical plot.
#
# @param cm A caret::confusionMatrix object.
draw_confusion_matrix_Neural_2 <- function(cm) {
  draw_confusion_matrix(cm, titleaddon = 'for Neural Network - Model 2')
}

# Plot the Confusion Matrix
draw_confusion_matrix_Neural_2(Confusion_Matrix_Neural_2)

Comments: BEST MODEL both Sensitivity and Accuracy

ROC Curve for Neural Model 2

On Validation
# ROC curve for Neural Model 2 on the validation set, built from the
# predicted probabilities (not the rounded labels).
set.seed(1)

# Load ROCR Package
library(ROCR)

# Plot our ROC Curve (TPR vs FPR); the diagonal is the no-skill reference
pr2_NN <- ROCR::prediction(Predictions_NN2_Probabilities, Validation_NN_Preprocess$diagnosis)
prf2_NN <- ROCR::performance(pr2_NN, measure = "tpr", x.measure = "fpr")
plot(prf2_NN, main="ROC for Validation Set")
abline(a = 0, b = 1) 

Advanced Neural Network with 2 Hidden Layers of 15 Hidden Nodes - Model 3

set.seed(1)

# Load library
library(neuralnet)
library(nnet)
library(devtools)

# Fit neural network with two hidden layers of 15 nodes each
Neural_3 <- neuralnet(diagnosis ~ ., data = Training_NN_Preprocess, hidden=c(15,15), linear.output = FALSE)

# Plot Neural Network Model 3 (the original comment mislabelled this as
# "Model 2"). rep = "best" renders the lowest-error repetition and works
# inside knitted documents, unlike the original `(plot(Neural_3))` which
# only printed NULL.
plot(Neural_3, rep = "best")

Predictions and Confusion Matrix - Neural Model 3

Confusion Matrix For Validation
# Score Neural Model 3 on the validation set and build the confusion matrix.
set.seed(1)

# Predictions: matrix with one column per output class
Predictions_NN3 <- predict(Neural_3, Validation_NN_Preprocess, type="response")
# Second column = probability of the positive class "1"
Predictions_NN3_Probabilities <- Predictions_NN3[,2]

# Rounding Predictions - 0.5 Threshold 
Predictions_NN3_Dummy <- round(Predictions_NN3_Probabilities)

# As Numeric
Predictions_NN3_Dummy <- as.numeric(Predictions_NN3_Dummy)

# Check rounding in a Dataframe (probability next to its rounded label)
DF_Neural_Predictions_3 <- cbind(Predictions_NN3_Probabilities, Predictions_NN3_Dummy)

# As Factor - confusionMatrix() requires factors with matching levels
Predictions_NN3_Dummy <- factor(Predictions_NN3_Dummy)
Validation_NN_Preprocess$diagnosis <- factor(Validation_NN_Preprocess$diagnosis)

# Confusion Matrix
Confusion_Matrix_Neural_3 <- confusionMatrix(data = Predictions_NN3_Dummy, reference = Validation_NN_Preprocess$diagnosis, positive = "1")

# Create the Function for Confusion Matrix
# Confusion-matrix plot for Neural Network Model 3.
#
# Thin wrapper around the generic draw_confusion_matrix() helper defined at
# the top of the file; the previous version duplicated its entire body with
# only the title changed.  draw_confusion_matrix() builds its title as
# paste0('CONFUSION MATRIX', ' ', titleaddon), so the addon below reproduces
# the original title 'CONFUSION MATRIX for Neural Network - Model 3' exactly.
#
# cm: a caret::confusionMatrix object.
draw_confusion_matrix_Neural_3 <- function(cm) {
  draw_confusion_matrix(cm, titleaddon = 'for Neural Network - Model 3')
}

# Plot the Confusion Matrix for Neural Network Model 3
draw_confusion_matrix_Neural_3(Confusion_Matrix_Neural_3)

Comments: NOT BETTER

Neural Network Best Model

# Model 2 had the best sensitivity and accuracy, so it is kept as the
# final neural network.
Best_Neural_Network_Confusion <- Confusion_Matrix_Neural_2

# Best predictions: hard 0/1 classes (as factor) and raw probabilities
Best_Neural_Network_Predictions_Dummy <- factor(Predictions_NN2_Dummy)
Best_Neural_Network_Predictions_Probabilities <- Predictions_NN2_Probabilities

# FINAL PREDICTIONS DATAFRAME
DF_Best_Neural_Network_Predictions <- data.frame(
  Best_Neural_Network_Predictions_Dummy,
  Best_Neural_Network_Predictions_Probabilities
)

# FINAL CONFUSION DATAFRAME: sensitivity, specificity and accuracy as
# a single named row
DF_Best_Neural_Confusion <- data.frame(
  c(Best_Neural_Network_Confusion$byClass[c(1, 2)],
    Best_Neural_Network_Confusion$overall[1])
)
colnames(DF_Best_Neural_Confusion) <- "Best Neural Network"
DF_Best_Neural_Confusion <- t(DF_Best_Neural_Confusion)

Discriminant Analysis

To run discriminant analysis the data needs to be centered and scaled, since otherwise variables with larger values might have a bigger influence.

set.seed(1)

# Drop the id column from each partition; it carries no predictive signal
ORIGINAL.DA.train <- dplyr::select(Training, -id)
ORIGINAL.DA.valid <- dplyr::select(Validation, -id)
ORIGINAL.DA.test  <- dplyr::select(Test, -id)

# Center and scale with statistics estimated on the training set only,
# then apply that same transformation to validation and test.
norm.value <- preProcess(ORIGINAL.DA.train, method = c("center", "scale"))
ORIGINAL.DA.train <- predict(norm.value, ORIGINAL.DA.train)
ORIGINAL.DA.valid <- predict(norm.value, ORIGINAL.DA.valid)
ORIGINAL.DA.test  <- predict(norm.value, ORIGINAL.DA.test)

First we run a linear discriminant analysis.

set.seed(1)

# Linear discriminant analysis on all standardized predictors
DA1 <- lda(diagnosis ~ ., data = ORIGINAL.DA.train)

# Score the validation set
predictions <- predict(DA1, ORIGINAL.DA.valid)

# Posterior probability of the positive (malignant) class
DA1.pred.valid.prob <- predictions$posterior[, 2]

# Predicted class labels
DA1.pred.valid.resp <- factor(predictions$class)

# Confusion matrix against the validation labels
DA1.conf.mat <- confusionMatrix(DA1.pred.valid.resp, ORIGINAL.DA.valid$diagnosis,
                                positive = "1")
draw_confusion_matrix(DA1.conf.mat, titleaddon = 'Discriminant Analysis')

# Evaluating LDA
# not run because it plots a lot of graphs
# partimat(diagnosis~., data = data.frame(ORIGINAL.DA.train), method="lda", mar=c(0.5, 0.5, 0.5, 0.5))

Not a bad model but again it lacks sensitivity. As the model is quite big we can try a variable selection.

# Backward stepwise variable selection for LDA, scored by the 10-fold
# cross-validated correctness rate (klaR::stepclass); output transcript below.
set.seed(1)
modelstepL <- stepclass(diagnosis ~ ., "lda", direction = "backward", data = data.frame(ORIGINAL.DA.train))
##  `stepwise classification', using 10-fold cross-validated correctness rate of method lda'.
## 300 observations of 30 variables in 2 classes; direction: backward
## stop criterion: improvement less than 5%.
## correctness rate: 0.94667;  starting variables (30): radius_mean, texture_mean, perimeter_mean, area_mean, smoothness_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, perimeter_worst, area_worst, smoothness_worst, compactness_worst, concavity_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst 
## correctness rate: 0.96333;  out: "radius_worst";  variables (29): radius_mean, texture_mean, perimeter_mean, area_mean, smoothness_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, texture_worst, perimeter_worst, area_worst, smoothness_worst, compactness_worst, concavity_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst 
## 
##  hr.elapsed min.elapsed sec.elapsed 
##        0.00        0.00        7.52
# Refit the LDA on the variables kept by backward stepclass
# (only radius_worst was dropped).  The backticked `concave points_*`
# terms match the original column names, which contain spaces.
DA1.sel <- lda(
  diagnosis ~ radius_mean + texture_mean + perimeter_mean + area_mean +
    smoothness_mean + compactness_mean + concavity_mean + `concave points_mean` +
    symmetry_mean + fractal_dimension_mean + radius_se + texture_se +
    perimeter_se + area_se + smoothness_se + compactness_se +
    concavity_se + `concave points_se` + symmetry_se + fractal_dimension_se +
    texture_worst + perimeter_worst + area_worst + smoothness_worst +
    compactness_worst + concavity_worst + `concave points_worst` +
    symmetry_worst + fractal_dimension_worst,
  data = ORIGINAL.DA.train
)

# Validation-set predictions from the reduced model
predictions.sel <- predict(DA1.sel, ORIGINAL.DA.valid)

# Posterior probability of the malignant class
DA1.sel.pred.valid.prob <- predictions.sel$posterior[, 2]

# Predicted class labels
DA1.sel.pred.valid.resp <- factor(predictions.sel$class)

# Confusion matrix on the validation set
DA1.sel.conf.mat <- confusionMatrix(DA1.sel.pred.valid.resp, ORIGINAL.DA.valid$diagnosis, positive = "1")
draw_confusion_matrix(DA1.sel.conf.mat, titleaddon = 'Discriminant Analysis')

We see that the model only drops one variable and the predictive power of the model doesn’t change much as instead of 1 false positive and 7 false negatives there are now 0 false positives and 8 false negatives.

As we have seen in the data analysis there is some correlation in the data, which is why we can try to run a quadratic discriminant analysis.

set.seed(1)

# Quadratic discriminant analysis: allows class-specific covariance
# matrices, which suits the correlated predictors seen in the EDA
DA2 <- qda(diagnosis ~ ., data = ORIGINAL.DA.train)

# Score the validation set
predictions <- predict(DA2, ORIGINAL.DA.valid)

# Posterior probability of the malignant class
DA2.pred.valid.prob <- predictions$posterior[, 2]

# Predicted class labels
DA2.pred.valid.resp <- factor(predictions$class)

# Confusion matrix on the validation set
DA2.conf.mat <- confusionMatrix(DA2.pred.valid.resp, ORIGINAL.DA.valid$diagnosis,
                                positive = "1")
draw_confusion_matrix(DA2.conf.mat, titleaddon = 'Quadratic Discriminant Analysis')

Overall this model is worse than the LDA, but in terms of sensitivity it is better, as there are only 4 false negatives.

# Backward stepwise variable selection for QDA, scored by the 10-fold
# cross-validated correctness rate (klaR::stepclass); output transcript below.
set.seed(1)
modelstepL <- stepclass(diagnosis ~ ., "qda", direction = "backward", data = data.frame(ORIGINAL.DA.train))
##  `stepwise classification', using 10-fold cross-validated correctness rate of method qda'.
## 300 observations of 30 variables in 2 classes; direction: backward
## stop criterion: improvement less than 5%.
## correctness rate: 0.94667;  starting variables (30): radius_mean, texture_mean, perimeter_mean, area_mean, smoothness_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, perimeter_worst, area_worst, smoothness_worst, compactness_worst, concavity_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst 
## correctness rate: 0.95333;  out: "area_mean";  variables (29): radius_mean, texture_mean, perimeter_mean, smoothness_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, perimeter_worst, area_worst, smoothness_worst, compactness_worst, concavity_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst 
## correctness rate: 0.96333;  out: "concavity_worst";  variables (28): radius_mean, texture_mean, perimeter_mean, smoothness_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, perimeter_worst, area_worst, smoothness_worst, compactness_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst 
## correctness rate: 0.96667;  out: "perimeter_worst";  variables (27): radius_mean, texture_mean, perimeter_mean, smoothness_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, area_worst, smoothness_worst, compactness_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst 
## correctness rate: 0.97;  out: "perimeter_mean";  variables (26): radius_mean, texture_mean, smoothness_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, area_worst, smoothness_worst, compactness_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst 
## correctness rate: 0.97667;  out: "smoothness_mean";  variables (25): radius_mean, texture_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, area_worst, smoothness_worst, compactness_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst 
## correctness rate: 0.98;  out: "smoothness_se";  variables (24): radius_mean, texture_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, area_worst, smoothness_worst, compactness_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst 
## 
##  hr.elapsed min.elapsed sec.elapsed 
##        0.00        0.00       19.39
# Refit the QDA on the 24 variables kept by backward stepclass (six were
# dropped).  Backticked `concave points_*` terms match the original
# space-containing column names.
DA2.sel <- qda(
  diagnosis ~ radius_mean + texture_mean + compactness_mean + concavity_mean +
    `concave points_mean` + symmetry_mean + fractal_dimension_mean +
    radius_se + texture_se + perimeter_se + area_se + compactness_se +
    concavity_se + `concave points_se` + symmetry_se + fractal_dimension_se +
    radius_worst + texture_worst + area_worst + smoothness_worst +
    compactness_worst + `concave points_worst` + symmetry_worst +
    fractal_dimension_worst,
  data = ORIGINAL.DA.train
)

# Validation-set predictions from the reduced model
predictions <- predict(DA2.sel, ORIGINAL.DA.valid)

# Posterior probability of the malignant class
DA2.sel.pred.valid.prob <- predictions$posterior[, 2]

# Predicted class labels
DA2.sel.pred.valid.resp <- factor(predictions$class)

# Confusion matrix on the validation set
DA2.sel.conf.mat <- confusionMatrix(DA2.sel.pred.valid.resp, ORIGINAL.DA.valid$diagnosis, positive = "1")
draw_confusion_matrix(DA2.sel.conf.mat, titleaddon = 'Quadratic Discriminant Analysis')

This time around 6 variables were dropped but we see that the model is not better than the qda with all predictors.

There are several more discriminant analysis methods that can be applied. Down below we tried Mixture discriminant analysis (MDA), which often outperforms QDA and LDA because the assumptions about the class distributions are looser than for LDA and QDA.

set.seed(1)

# Mixture discriminant analysis (mda): models each class with a mixture
# of Gaussians, relaxing the single-Gaussian assumption of LDA/QDA
DA4 <- mda(diagnosis ~ ., data = ORIGINAL.DA.train)

# Posterior probability of the malignant class on the validation set
DA4.pred.valid.prob <- predict(DA4, ORIGINAL.DA.valid, type = "posterior")[, 2]

# Classify at the 0.5 threshold
DA4.pred.valid.resp <- factor(ifelse(DA4.pred.valid.prob > 0.5, 1, 0))

# Confusion matrix on the validation set
DA4.conf.mat <- confusionMatrix(DA4.pred.valid.resp, ORIGINAL.DA.valid$diagnosis,
                                positive = "1")
draw_confusion_matrix(DA4.conf.mat, 'Mixture discriminant analysis')

We see that this method in general works better than the lda and qda. In this case we don’t proceed with this model as it isn’t better in sensitivity which is what we want when predicting cancer.

Down below we run a flexible discriminant analysis (fda) which is an extension of lda using non-linear combinations of predictors (splines)

set.seed(1)

# Flexible discriminant analysis: LDA extended with non-linear
# (spline-based) combinations of the predictors
DA5 <- fda(diagnosis ~ ., data = ORIGINAL.DA.train)

# Posterior probability of the malignant class on the validation set
DA5.pred.valid.prob <- predict(DA5, ORIGINAL.DA.valid, type = "posterior")[, 2]

# Classify at the 0.5 threshold
DA5.pred.valid.resp <- factor(ifelse(DA5.pred.valid.prob > 0.5, 1, 0))

# Confusion matrix on the validation set
DA5.conf.mat <- confusionMatrix(DA5.pred.valid.resp, ORIGINAL.DA.valid$diagnosis,
                                positive = "1")
draw_confusion_matrix(DA5.conf.mat, titleaddon = 'Flexible discriminant analysis')

We see that the model has a good accuracy but isn’t good in sensitivity. Therefore we don’t use it further.

Lastly we fit a regularized discriminant analysis (RDA) which is a trade off between qda and lda.

set.seed(1)

# Regularized discriminant analysis (klaR::rda), a compromise between
# LDA and QDA.
# Fix: results are stored in DA6.* — the previous version reassigned
# DA5, silently overwriting the flexible-discriminant model fitted in
# the chunk above.
DA6 <- rda(diagnosis ~ ., data = data.frame(ORIGINAL.DA.train))

# Posterior probability of the malignant class on the validation set
DA6.pred.valid.prob <- predict(DA6, data.frame(ORIGINAL.DA.valid))$posterior[, 2]

# Classify at the 0.5 threshold
DA6.pred.valid.resp <- factor(ifelse(DA6.pred.valid.prob > 0.5, 1, 0))

# Confusion matrix on the validation set
DA6.conf.mat <- confusionMatrix(DA6.pred.valid.resp, ORIGINAL.DA.valid$diagnosis, positive = "1")
draw_confusion_matrix(DA6.conf.mat, titleaddon = 'Regularized discriminant analysis')

This model isn’t better than any other model therefore we don’t use it further.

As the highest sensitivity was achieved by the QDA with acceptable accuracy, we chose this model as the best one.

# QDA (DA2) is kept as the best discriminant model: highest sensitivity
# with acceptable accuracy.
Best_DA_Confusion <- DA2.conf.mat

# Best predictions: hard classes (as factor) and posterior probabilities
Best_DA_Predictions_Dummy <- factor(DA2.pred.valid.resp)
Best_DA_Predictions_Probabilities <- DA2.pred.valid.prob

# FINAL PREDICTIONS DATAFRAME
DF_Best_DA_Predictions <- data.frame(
  Best_DA_Predictions_Dummy,
  Best_DA_Predictions_Probabilities
)

# FINAL CONFUSION DATAFRAME: sensitivity, specificity and accuracy as
# a single named row
DF_Best_DA_Confusion <- data.frame(
  c(Best_DA_Confusion$byClass[c(1, 2)], Best_DA_Confusion$overall[1])
)
colnames(DF_Best_DA_Confusion) <- "Best DA"
DF_Best_DA_Confusion <- t(DF_Best_DA_Confusion)

Ensemble Methods

Bagging

set.seed(1)
# Bagged classification trees (adabag).
# NOTE(review): the fitted model shadows the adabag::bagging() function;
# consider renaming (e.g. bag_model) if nothing outside this view uses it.
bagging<- bagging(diagnosis ~ ., data =Training_M)
# Class predictions on the validation set
bag_pred<- predict(bagging, Validation_M, type="class")
# Validation confusion matrix ("1" = malignant is the positive class)
bag_cf <- confusionMatrix(as.factor(bag_pred$class), Validation_M$diagnosis, positive = "1")
draw_confusion_matrix(bag_cf)

Boosting

set.seed(1)
# AdaBoost.M1 boosted classification trees (adabag).
# NOTE(review): the fitted model shadows the adabag::boosting() function;
# the object is reused by this name in the best-model chunk further down,
# so it is left unchanged here.
boosting <- boosting(diagnosis ~ ., data = Training_M)
# Class predictions on the validation set
boost_pred<- predict(boosting, Validation_M, type="class")
# Validation confusion matrix ("1" = malignant is the positive class)
boost_cf <- confusionMatrix(as.factor(boost_pred$class), Validation_M$diagnosis, positive = "1")
draw_confusion_matrix(boost_cf)

Random Forests

set.seed(1)

# Random forest trying 4 variables at each split; importance = TRUE so
# the variable-importance plot below is available
rand_f <- randomForest(diagnosis ~ ., data = Training_M, mtry = 4, importance = TRUE)

# Variable importance (type = 1: mean decrease in accuracy)
varImpPlot(rand_f, type = 1, cex = 0.7)

# Class predictions and validation confusion matrix
rf_pred <- predict(rand_f, Validation_M, type = "class")
rf_cf <- confusionMatrix(as.factor(rf_pred), Validation_M$diagnosis, positive = "1")
draw_confusion_matrix(rf_cf)

ROC curves

# ROC curves and AUC for the three ensemble models (pROC::roc).
# Each model gets a small data frame holding the true labels (column 1 of
# Validation_M) and its predicted classes, both coerced to numeric 0/1;
# roc() is then called with the unquoted column names of that data frame
# (non-standard evaluation), so the names must match exactly.
# For the boosting
response_boost <- data.frame(Validation_M[,1], boost_pred$class) 
response_boost$Validation_M...1.<- as.numeric(as.character(response_boost[,1]))
response_boost$boost_pred.class<- as.numeric(as.character(response_boost[,2]))
roc_score_boost <- roc(data= response_boost , response=Validation_M...1., boost_pred.class) #AUC score
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# For bagging
response_bag <- data.frame(Validation_M[,1], bag_pred$class) 
response_bag$Validation_M...1.<- as.numeric(as.character(response_bag[,1]))
response_bag$bag_pred.class<- as.numeric(as.character(response_bag[,2]))
roc_score_bag <- roc(data= response_bag , response=Validation_M...1., bag_pred.class) #AUC score
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# For Random Forests
response_rf <- data.frame(Validation_M[,1], rf_pred) 
response_rf$Validation_M...1.<- as.numeric(as.character(response_rf[,1]))
response_rf$rf_pred<- as.numeric(as.character(response_rf[,2]))
roc_score_rf <- roc(data= response_rf , response=Validation_M...1., rf_pred) #AUC score
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot ROC curves side by side
par(mfrow=c(1,3)) 
plot(roc_score_boost ,main ="ROC curve for Boosting", cex= 1)
plot(roc_score_bag ,main ="ROC curve for Bagging", cex=1)
plot(roc_score_rf ,main ="ROC curve for Random Forests", cex=1)

Best Model - Boosting Trees

# The boosted tree is the best ensemble model; collect its artifacts.

# Confusion matrix of the best tree model
best_boosted_confusion <- boost_cf

# Class predictions of the best tree model
best_boosted_pred <- boost_pred

# Class probabilities (column 2 = malignant)
best_boosted_prob <- predict(boosting, Validation_M, type = "prob")

# Predicted classes and malignant-class probabilities side by side
DF_best_boosted_pred <- data.frame(
  best_boosted_pred$class,
  best_boosted_prob$prob[, 2]
)

# Sensitivity, specificity and accuracy as a single named row
DF_best_boosted_confusion <- data.frame(
  c(best_boosted_confusion$byClass[c(1, 2)], best_boosted_confusion$overall[1])
)
colnames(DF_best_boosted_confusion) <- "Boosted Tree Model"
DF_best_boosted_confusion <- t(DF_best_boosted_confusion)

Best 5 Models

##     Best_Logistic_Predictions_Dummy Best_Logistic_Predictions_Probabilities
## 1                                 1                          0.999998884279
## 2                                 1                          0.764995713849
## 3                                 1                          0.968538861355
## 4                                 1                          0.931515195197
## 5                                 1                          0.990662066546
## 6                                 1                          0.875723102074
## 7                                 0                          0.309018569943
## 8                                 1                          0.999321090298
## 9                                 0                          0.371587814346
## 10                                1                          0.999067567458
## 11                                1                          0.675002354640
## 12                                0                          0.000077823130
## 13                                0                          0.079744986206
## 14                                1                          0.999998815292
## 15                                0                          0.447995457713
## 16                                0                          0.447956038786
## 17                                1                          0.999996624809
## 18                                0                          0.001096642226
## 19                                0                          0.061921993277
## 20                                1                          0.914933882168
## 21                                0                          0.000487539786
## 22                                1                          0.723279433991
## 23                                0                          0.002426941623
## 24                                1                          0.999999996385
## 25                                1                          0.695308338264
## 26                                0                          0.003461771367
## 27                                0                          0.385131694427
## 28                                0                          0.005757588165
## 29                                1                          0.996757181863
## 30                                1                          0.999982530644
## 31                                1                          0.832397618429
## 32                                0                          0.105144180129
## 33                                0                          0.000073595543
## 34                                0                          0.009073336233
## 35                                0                          0.022126770357
## 36                                0                          0.000050824722
## 37                                1                          0.994816310902
## 38                                0                          0.008170151323
## 39                                0                          0.228811168809
## 40                                1                          0.999973242032
## 41                                0                          0.009449468658
## 42                                1                          0.723819821987
## 43                                0                          0.022136113515
## 44                                0                          0.000158845010
## 45                                0                          0.000796344731
## 46                                0                          0.000623730356
## 47                                0                          0.000013039320
## 48                                1                          0.862758616151
## 49                                0                          0.000052000219
## 50                                0                          0.045376732585
## 51                                0                          0.003793978558
## 52                                0                          0.003023703028
## 53                                0                          0.140343168303
## 54                                1                          0.994900095799
## 55                                0                          0.020250130231
## 56                                1                          0.997278972177
## 57                                0                          0.007964105607
## 58                                0                          0.001746447355
## 59                                1                          0.997661830162
## 60                                0                          0.000064475980
## 61                                0                          0.000056528523
## 62                                1                          0.998427484755
## 63                                0                          0.006931465439
## 64                                0                          0.000578360576
## 65                                1                          0.992294603988
## 66                                0                          0.006197709274
## 67                                1                          0.975214547873
## 68                                1                          0.999999999921
## 69                                1                          0.691051595907
## 70                                0                          0.000473138075
## 71                                0                          0.170343180410
## 72                                1                          1.000000000000
## 73                                0                          0.014410223017
## 74                                1                          0.951609407809
## 75                                0                          0.176230468310
## 76                                1                          0.999970030499
## 77                                0                          0.017365719016
## 78                                1                          0.999999999788
## 79                                1                          0.999900452658
## 80                                0                          0.000180884137
## 81                                1                          0.999999171928
## 82                                1                          0.998460014879
## 83                                1                          0.999992661214
## 84                                0                          0.087245511902
## 85                                1                          0.999615924199
## 86                                0                          0.001936983381
## 87                                0                          0.003252824380
## 88                                0                          0.018134138230
## 89                                0                          0.000975576735
## 90                                0                          0.001084531369
## 91                                0                          0.002616531588
## 92                                1                          0.999986405229
## 93                                0                          0.000190604286
## 94                                0                          0.000189016498
## 95                                0                          0.003412507401
## 96                                0                          0.000073217293
## 97                                1                          0.977909201124
## 98                                0                          0.000382518300
## 99                                0                          0.010933841771
## 100                               0                          0.004309603619
## 101                               0                          0.000220894978
## 102                               1                          1.000000000000
## 103                               0                          0.004122332728
## 104                               0                          0.007017374293
## 105                               1                          0.999999812265
## 106                               0                          0.054930762660
## 107                               0                          0.000473696893
## 108                               0                          0.001822900621
## 109                               0                          0.001913543870
## 110                               1                          0.999999988169
## 111                               0                          0.137803855881
## 112                               1                          0.999969986215
## 113                               0                          0.001615127586
## 114                               0                          0.000533952997
## 115                               0                          0.000007471704
## 116                               0                          0.453693547937
## 117                               0                          0.115060254023
## 118                               0                          0.006452918089
## 119                               0                          0.473136989065
## 120                               0                          0.003164996296
## 121                               1                          0.999847047350
## 122                               0                          0.004296573993
## 123                               0                          0.000906813205
## 124                               0                          0.000057345126
## 125                               0                          0.005165357314
## 126                               0                          0.005150291371
## 127                               0                          0.001804794845
## 128                               0                          0.062674062724
## 129                               0                          0.336494350429
## 130                               0                          0.152993823354
## 131                               1                          0.992930485924
## 132                               0                          0.000517600421
## 133                               1                          0.875947037782
## 134                               0                          0.006788665821
## 135                               0                          0.008687277132
## 136                               0                          0.057282662110
## 137                               1                          0.862352744392
## 138                               0                          0.021399111264
## 139                               1                          0.999999991490
## 140                               1                          0.884513809822
## 141                               1                          0.998506675406
## 142                               0                          0.049946876869
## 143                               0                          0.194272366445
## 144                               1                          0.999211716151
## 145                               1                          0.999890029439
## 146                               0                          0.009781052378
## 147                               1                          0.999999999954
## 148                               0                          0.009735025868
## 149                               0                          0.002310519659
## 150                               0                          0.000237651172
## 151                               0                          0.085351246061
## 152                               0                          0.000021794342
## 153                               0                          0.000268973544
## 154                               0                          0.000126063015
## 155                               0                          0.007535119744
## 156                               0                          0.028136026770
## 157                               1                          0.999999744789
## 158                               1                          0.999896670077
## 159                               1                          0.999999983750
##                                Sensitivity Specificity  Accuracy
## Best Logistic Lasso Regression   0.8181818   0.9569892 0.8993711
##     Best_KNN_Predictions_Dummy Best_KNN_Predictions_Probabilities
## 1                            1                          1.0000000
## 2                            1                          0.8333333
## 3                            1                          1.0000000
## 4                            1                          1.0000000
## 5                            1                          1.0000000
## 6                            1                          0.6666667
## 7                            0                          0.0000000
## 8                            1                          1.0000000
## 9                            1                          0.8333333
## 10                           1                          1.0000000
## 11                           1                          0.8333333
## 12                           1                          0.5000000
## 13                           0                          0.3333333
## 14                           1                          1.0000000
## 15                           1                          0.6666667
## 16                           0                          0.5000000
## 17                           1                          1.0000000
## 18                           0                          0.0000000
## 19                           0                          0.0000000
## 20                           1                          1.0000000
## 21                           0                          0.0000000
## 22                           1                          1.0000000
## 23                           0                          0.0000000
## 24                           1                          1.0000000
## 25                           1                          0.8333333
## 26                           0                          0.0000000
## 27                           1                          0.8333333
## 28                           0                          0.0000000
## 29                           1                          1.0000000
## 30                           1                          1.0000000
## 31                           0                          0.1666667
## 32                           0                          0.5000000
## 33                           0                          0.0000000
## 34                           0                          0.0000000
## 35                           0                          0.0000000
## 36                           0                          0.0000000
## 37                           1                          1.0000000
## 38                           0                          0.0000000
## 39                           0                          0.3333333
## 40                           1                          1.0000000
## 41                           0                          0.0000000
## 42                           0                          0.0000000
## 43                           0                          0.0000000
## 44                           0                          0.1666667
## 45                           0                          0.0000000
## 46                           0                          0.0000000
## 47                           0                          0.0000000
## 48                           1                          1.0000000
## 49                           0                          0.0000000
## 50                           0                          0.0000000
## 51                           0                          0.0000000
## 52                           0                          0.0000000
## 53                           0                          0.0000000
## 54                           1                          1.0000000
## 55                           0                          0.0000000
## 56                           1                          1.0000000
## 57                           0                          0.0000000
## 58                           0                          0.0000000
## 59                           1                          1.0000000
## 60                           0                          0.0000000
## 61                           0                          0.0000000
## 62                           1                          0.8333333
## 63                           0                          0.0000000
## 64                           0                          0.0000000
## 65                           1                          1.0000000
## 66                           0                          0.0000000
## 67                           1                          0.8333333
## 68                           1                          1.0000000
## 69                           0                          0.1666667
## 70                           0                          0.0000000
## 71                           0                          0.0000000
## 72                           1                          1.0000000
## 73                           0                          0.0000000
## 74                           1                          1.0000000
## 75                           0                          0.0000000
## 76                           1                          1.0000000
## 77                           0                          0.0000000
## 78                           1                          1.0000000
## 79                           1                          1.0000000
## 80                           0                          0.0000000
## 81                           1                          1.0000000
## 82                           1                          0.8333333
## 83                           1                          1.0000000
## 84                           0                          0.1666667
## 85                           1                          1.0000000
## 86                           0                          0.0000000
## 87                           0                          0.0000000
## 88                           0                          0.0000000
## 89                           0                          0.0000000
## 90                           0                          0.0000000
## 91                           0                          0.0000000
## 92                           1                          1.0000000
## 93                           0                          0.0000000
## 94                           0                          0.0000000
## 95                           0                          0.0000000
## 96                           0                          0.0000000
## 97                           1                          0.6666667
## 98                           0                          0.0000000
## 99                           0                          0.0000000
## 100                          0                          0.0000000
## 101                          0                          0.0000000
## 102                          1                          1.0000000
## 103                          0                          0.0000000
## 104                          0                          0.0000000
## 105                          1                          1.0000000
## 106                          0                          0.0000000
## 107                          0                          0.0000000
## 108                          0                          0.0000000
## 109                          0                          0.0000000
## 110                          1                          1.0000000
## 111                          0                          0.1666667
## 112                          1                          1.0000000
## 113                          0                          0.0000000
## 114                          0                          0.0000000
## 115                          0                          0.0000000
## 116                          0                          0.3333333
## 117                          0                          0.0000000
## 118                          0                          0.0000000
## 119                          1                          0.5000000
## 120                          0                          0.0000000
## 121                          1                          1.0000000
## 122                          0                          0.0000000
## 123                          0                          0.0000000
## 124                          0                          0.0000000
## 125                          0                          0.0000000
## 126                          0                          0.0000000
## 127                          0                          0.0000000
## 128                          0                          0.0000000
## 129                          0                          0.1666667
## 130                          0                          0.0000000
## 131                          1                          0.6666667
## 132                          0                          0.0000000
## 133                          0                          0.1666667
## 134                          0                          0.1666667
## 135                          0                          0.0000000
## 136                          0                          0.1666667
## 137                          1                          0.8333333
## 138                          0                          0.0000000
## 139                          1                          1.0000000
## 140                          0                          0.0000000
## 141                          1                          1.0000000
## 142                          0                          0.0000000
## 143                          0                          0.1666667
## 144                          1                          1.0000000
## 145                          1                          1.0000000
## 146                          0                          0.0000000
## 147                          1                          1.0000000
## 148                          0                          0.0000000
## 149                          0                          0.1666667
## 150                          0                          0.0000000
## 151                          0                          0.0000000
## 152                          0                          0.0000000
## 153                          0                          0.0000000
## 154                          0                          0.0000000
## 155                          0                          0.0000000
## 156                          0                          0.1666667
## 157                          1                          1.0000000
## 158                          1                          1.0000000
## 159                          1                          1.0000000
##          Sensitivity Specificity  Accuracy
## Best KNN   0.8484848   0.9784946 0.9245283
##     best_boosted_pred.class best_boosted_prob.prob...2.
## 1                         1                 1.000000000
## 2                         1                 0.774589153
## 3                         1                 0.900655350
## 4                         1                 0.840310897
## 5                         1                 0.840552805
## 6                         1                 0.975346710
## 7                         0                 0.173319532
## 8                         1                 0.770050514
## 9                         1                 0.813777944
## 10                        1                 1.000000000
## 11                        1                 0.666069277
## 12                        0                 0.485140647
## 13                        0                 0.472907382
## 14                        1                 0.867183905
## 15                        1                 0.827485860
## 16                        1                 0.551442850
## 17                        1                 0.943153946
## 18                        0                 0.104094062
## 19                        0                 0.149309845
## 20                        1                 0.896966945
## 21                        0                 0.056507886
## 22                        1                 0.953335671
## 23                        0                 0.027904221
## 24                        1                 0.888562394
## 25                        1                 0.570923920
## 26                        0                 0.086972061
## 27                        0                 0.350104997
## 28                        0                 0.176722195
## 29                        1                 0.968597971
## 30                        1                 0.957678122
## 31                        1                 0.650022797
## 32                        1                 0.510345761
## 33                        0                 0.022008290
## 34                        0                 0.066225986
## 35                        0                 0.217549506
## 36                        0                 0.125683900
## 37                        1                 0.926877683
## 38                        0                 0.082787917
## 39                        0                 0.476420033
## 40                        1                 0.990015244
## 41                        0                 0.094234128
## 42                        0                 0.331647435
## 43                        0                 0.119130313
## 44                        0                 0.334636340
## 45                        0                 0.074264895
## 46                        0                 0.044511696
## 47                        0                 0.095300075
## 48                        1                 0.925137213
## 49                        0                 0.018748451
## 50                        0                 0.152561423
## 51                        0                 0.087173859
## 52                        0                 0.083312358
## 53                        0                 0.122188715
## 54                        1                 0.931222911
## 55                        0                 0.034332890
## 56                        1                 0.929496528
## 57                        0                 0.115052194
## 58                        0                 0.089774503
## 59                        1                 0.969821814
## 60                        0                 0.071616017
## 61                        0                 0.122274949
## 62                        1                 0.859281159
## 63                        0                 0.089287961
## 64                        0                 0.017856928
## 65                        1                 0.744014206
## 66                        0                 0.000000000
## 67                        1                 0.823415386
## 68                        1                 0.897342351
## 69                        1                 0.556676801
## 70                        0                 0.028042743
## 71                        0                 0.190829375
## 72                        1                 0.757541299
## 73                        0                 0.051956002
## 74                        1                 0.975981357
## 75                        0                 0.263328913
## 76                        1                 0.937404465
## 77                        0                 0.073495304
## 78                        1                 0.990015244
## 79                        1                 0.918629913
## 80                        0                 0.021196511
## 81                        1                 0.981248482
## 82                        1                 0.971595461
## 83                        1                 0.990015244
## 84                        0                 0.244175154
## 85                        1                 0.970748769
## 86                        0                 0.027356306
## 87                        0                 0.072567968
## 88                        0                 0.062795569
## 89                        0                 0.104884677
## 90                        0                 0.030938610
## 91                        0                 0.072409791
## 92                        1                 0.991757675
## 93                        0                 0.039018248
## 94                        0                 0.137619467
## 95                        0                 0.031034093
## 96                        0                 0.030434358
## 97                        1                 0.641961337
## 98                        0                 0.036588813
## 99                        0                 0.039552469
## 100                       0                 0.074014837
## 101                       0                 0.017578831
## 102                       1                 0.974192288
## 103                       0                 0.051715684
## 104                       0                 0.057666986
## 105                       1                 0.964114863
## 106                       0                 0.056924331
## 107                       0                 0.042011303
## 108                       0                 0.073407002
## 109                       0                 0.000000000
## 110                       1                 1.000000000
## 111                       0                 0.495743385
## 112                       1                 0.980094569
## 113                       0                 0.009706093
## 114                       0                 0.073757104
## 115                       0                 0.022309543
## 116                       0                 0.461477113
## 117                       0                 0.354867644
## 118                       0                 0.022597626
## 119                       0                 0.334561463
## 120                       0                 0.087071908
## 121                       1                 0.958423702
## 122                       0                 0.067040824
## 123                       0                 0.177045406
## 124                       0                 0.079493545
## 125                       0                 0.168245844
## 126                       0                 0.109404005
## 127                       0                 0.130552835
## 128                       0                 0.195253417
## 129                       0                 0.360113570
## 130                       0                 0.250925079
## 131                       1                 0.810531285
## 132                       0                 0.074490889
## 133                       1                 0.593280237
## 134                       0                 0.140916383
## 135                       0                 0.046842791
## 136                       0                 0.287045790
## 137                       1                 0.670715994
## 138                       0                 0.070675559
## 139                       1                 0.936132593
## 140                       0                 0.213574145
## 141                       1                 0.892803380
## 142                       0                 0.255380052
## 143                       0                 0.453903337
## 144                       1                 1.000000000
## 145                       1                 1.000000000
## 146                       0                 0.093101229
## 147                       1                 1.000000000
## 148                       0                 0.140118064
## 149                       0                 0.271646140
## 150                       0                 0.067638368
## 151                       0                 0.323901690
## 152                       0                 0.027631463
## 153                       0                 0.059894681
## 154                       0                 0.009237847
## 155                       0                 0.101730559
## 156                       0                 0.112283477
## 157                       1                 0.959194619
## 158                       1                 0.911177622
## 159                       1                 0.938837924
##                    Sensitivity Specificity  Accuracy
## Boosted Tree Model   0.8787879   0.9784946 0.9371069
##     Best_DA_Predictions_Dummy
## 1                           1
## 2                           1
## 3                           1
## 4                           1
## 5                           1
## 6                           1
## 7                           0
## 8                           1
## 9                           1
## 10                          1
## 11                          1
## 12                          1
## 13                          1
## 14                          1
## 15                          1
## 16                          1
## 17                          1
## 18                          0
## 19                          0
## 20                          1
## 21                          0
## 22                          1
## 23                          0
## 24                          1
## 25                          1
## 26                          0
## 27                          0
## 28                          0
## 29                          1
## 30                          1
## 31                          0
## 32                          1
## 33                          0
## 34                          0
## 35                          0
## 36                          0
## 37                          1
## 38                          0
## 39                          1
## 40                          1
## 41                          0
## 42                          0
## 43                          0
## 44                          0
## 45                          0
## 46                          0
## 47                          0
## 48                          1
## 49                          0
## 50                          0
## 51                          0
## 52                          0
## 53                          0
## 54                          1
## 55                          0
## 56                          1
## 57                          0
## 58                          0
## 59                          1
## 60                          0
## 61                          0
## 62                          1
## 63                          0
## 64                          0
## 65                          1
## 66                          0
## 67                          1
## 68                          1
## 69                          1
## 70                          0
## 71                          0
## 72                          1
## 73                          0
## 74                          1
## 75                          1
## 76                          1
## 77                          0
## 78                          1
## 79                          1
## 80                          0
## 81                          1
## 82                          1
## 83                          1
## 84                          0
## 85                          1
## 86                          0
## 87                          0
## 88                          0
## 89                          0
## 90                          0
## 91                          0
## 92                          1
## 93                          0
## 94                          0
## 95                          0
## 96                          0
## 97                          1
## 98                          0
## 99                          0
## 100                         0
## 101                         0
## 102                         1
## 103                         0
## 104                         0
## 105                         1
## 106                         0
## 107                         0
## 108                         0
## 109                         0
## 110                         1
## 111                         0
## 112                         1
## 113                         0
## 114                         0
## 115                         0
## 116                         1
## 117                         1
## 118                         0
## 119                         1
## 120                         0
## 121                         1
## 122                         0
## 123                         0
## 124                         0
## 125                         0
## 126                         0
## 127                         0
## 128                         0
## 129                         0
## 130                         0
## 131                         1
## 132                         0
## 133                         1
## 134                         0
## 135                         0
## 136                         0
## 137                         1
## 138                         0
## 139                         1
## 140                         1
## 141                         1
## 142                         0
## 143                         1
## 144                         1
## 145                         1
## 146                         0
## 147                         1
## 148                         0
## 149                         0
## 150                         0
## 151                         0
## 152                         0
## 153                         0
## 154                         0
## 155                         0
## 156                         0
## 157                         1
## 158                         1
## 159                         1
##                                                                                                        Best_DA_Predictions_Probabilities
## 1   1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 2   0.9999999999999975575093458246556110680103302001953125000000000000000000000000000000000000000000000000000000000000000000000000000000
## 3   1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 4   1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 5   1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 6   1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 7   0.0000710520812996915984748394490289058467169525101780891418457031250000000000000000000000000000000000000000000000000000000000000000
## 8   1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 9   1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 10  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 11  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 12  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 13  0.9997780399279516672095269314013421535491943359375000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 14  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 15  0.9999999999999980015985556747182272374629974365234375000000000000000000000000000000000000000000000000000000000000000000000000000000
## 16  0.9999999999999962252417162744677625596523284912109375000000000000000000000000000000000000000000000000000000000000000000000000000000
## 17  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 18  0.0000027803801730188075838623440461105928989127278327941894531250000000000000000000000000000000000000000000000000000000000000000000
## 19  0.0010635516868209913925180654103996857884339988231658935546875000000000000000000000000000000000000000000000000000000000000000000000
## 20  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 21  0.0000000000000000040089758134594085207946800863965108874253928661346435546875000000000000000000000000000000000000000000000000000000
## 22  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 23  0.0000000000000000877229703080208169079651225885640997148584574460983276367187500000000000000000000000000000000000000000000000000000
## 24  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 25  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 26  0.0000000996704046435446652102418951812978775706142187118530273437500000000000000000000000000000000000000000000000000000000000000000
## 27  0.0000111503138261254224593028300249741846528195310384035110473632812500000000000000000000000000000000000000000000000000000000000000
## 28  0.0021567715433414235380593648727653999230824410915374755859375000000000000000000000000000000000000000000000000000000000000000000000
## 29  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 30  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 31  0.0130150785282694327754660079676796158310025930404663085937500000000000000000000000000000000000000000000000000000000000000000000000
## 32  0.9999999999733575339888602684368379414081573486328125000000000000000000000000000000000000000000000000000000000000000000000000000000
## 33  0.0000000000000000002939943269257253841142407679853931767866015434265136718750000000000000000000000000000000000000000000000000000000
## 34  0.0000000230902453492269946742389274962903300547623075544834136962890625000000000000000000000000000000000000000000000000000000000000
## 35  0.0000000000011069818657987654447272599100848822217812994495034217834472656250000000000000000000000000000000000000000000000000000000
## 36  0.0000000000000000000000000000000000000146183446910236947277259011990224735200172290205955505371093750000000000000000000000000000000
## 37  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 38  0.0000000000000000027605226014118660095297830281424467102624475955963134765625000000000000000000000000000000000000000000000000000000
## 39  0.9999999845735978443173053165082819759845733642578125000000000000000000000000000000000000000000000000000000000000000000000000000000
## 40  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 41  0.0000003764950487508283739467723760796502574521582573652267456054687500000000000000000000000000000000000000000000000000000000000000
## 42  0.0051374046439231994126695113322966790292412042617797851562500000000000000000000000000000000000000000000000000000000000000000000000
## 43  0.0000008272240157858174239985382669715363590512424707412719726562500000000000000000000000000000000000000000000000000000000000000000
## 44  0.0000000000002704382056290124125780618968839519311586627736687660217285156250000000000000000000000000000000000000000000000000000000
## 45  0.0000000092673017684619869646645007321694720303639769554138183593750000000000000000000000000000000000000000000000000000000000000000
## 46  0.0000000000000000000000000087245171358322165643811585411526721145492047071456909179687500000000000000000000000000000000000000000000
## 47  0.0000000000000000000000001768051300108728459336768223941760425077518448233604431152343750000000000000000000000000000000000000000000
## 48  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 49  0.0000000000000000008043797097043138683708507485903282940853387117385864257812500000000000000000000000000000000000000000000000000000
## 50  0.0000000000001832346311407060256322421043506665228051133453845977783203125000000000000000000000000000000000000000000000000000000000
## 51  0.0000000000021207670678241855551689892811495496971474494785070419311523437500000000000000000000000000000000000000000000000000000000
## 52  0.0000000000000000000000000000000000000000000000006757991436695814522518654943183946670615114271640777587890625000000000000000000000
## 53  0.0303452790768879709926153509513824246823787689208984375000000000000000000000000000000000000000000000000000000000000000000000000000
## 54  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 55  0.0000000004343661207081146368065571738981134330970235168933868408203125000000000000000000000000000000000000000000000000000000000000
## 56  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 57  0.0000000061411735570359950800445325569398846710100769996643066406250000000000000000000000000000000000000000000000000000000000000000
## 58  0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000008757152
## 59  0.9999994164038339405919941782485693693161010742187500000000000000000000000000000000000000000000000000000000000000000000000000000000
## 60  0.0000000000000000116167084200637080334227169142735647255904041230678558349609375000000000000000000000000000000000000000000000000000
## 61  0.0000000000000000448231454215971346181912604578201353433541953563690185546875000000000000000000000000000000000000000000000000000000
## 62  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 63  0.0000024502535460829765246761058339330929811694659292697906494140625000000000000000000000000000000000000000000000000000000000000000
## 64  0.0000000000011423855564940835118775686041203698550816625356674194335937500000000000000000000000000000000000000000000000000000000000
## 65  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 66  0.0000000001414101770339002895737062770464831373828928917646408081054687500000000000000000000000000000000000000000000000000000000000
## 67  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 68  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 69  0.9999999999998270272527634006110019981861114501953125000000000000000000000000000000000000000000000000000000000000000000000000000000
## 70  0.0000000000000000116227384358952179083807054738031183660496026277542114257812500000000000000000000000000000000000000000000000000000
## 71  0.0263144216679913334200335839341278187930583953857421875000000000000000000000000000000000000000000000000000000000000000000000000000
## 72  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 73  0.0000000013929124464690875896850930093506804041680879890918731689453125000000000000000000000000000000000000000000000000000000000000
## 74  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 75  0.9810254462575809242252944386564195156097412109375000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 76  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 77  0.0000019942506748477742234762488671151459129760041832923889160156250000000000000000000000000000000000000000000000000000000000000000
## 78  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 79  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 80  0.0000000000351715654907029313469904452382763793139019981026649475097656250000000000000000000000000000000000000000000000000000000000
## 81  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 82  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 83  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 84  0.0274363140005409074373421418613361311145126819610595703125000000000000000000000000000000000000000000000000000000000000000000000000
## 85  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 86  0.0000000008457301625076878005479730227023082989035174250602722167968750000000000000000000000000000000000000000000000000000000000000
## 87  0.0000118797690332885366519891090497651475743623450398445129394531250000000000000000000000000000000000000000000000000000000000000000
## 88  0.0000018374743936530151301039781586155186232645064592361450195312500000000000000000000000000000000000000000000000000000000000000000
## 89  0.0000000000012144522442154906622996043963347290173260262235999107360839843750000000000000000000000000000000000000000000000000000000
## 90  0.0000000014009140770926996696736643865932592234457843005657196044921875000000000000000000000000000000000000000000000000000000000000
## 91  0.0000000029318132008440507733568192172413091611815616488456726074218750000000000000000000000000000000000000000000000000000000000000
## 92  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 93  0.0000000000815580245323561980191895415259750734549015760421752929687500000000000000000000000000000000000000000000000000000000000000
## 94  0.0000000000000000000000002360773661836484148702358654148270034056622534990310668945312500000000000000000000000000000000000000000000
## 95  0.0000000000000000930329034072286089815384535484099615132436156272888183593750000000000000000000000000000000000000000000000000000000
## 96  0.0000000000000005574846518635445458307586807933375894208438694477081298828125000000000000000000000000000000000000000000000000000000
## 97  1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 98  0.0000000000000027444488122209030548769720381230285966012161225080490112304687500000000000000000000000000000000000000000000000000000
## 99  0.0000000003613146156164692752608622039467434206017060205340385437011718750000000000000000000000000000000000000000000000000000000000
## 100 0.0000000000012768770254743897038691990974967893635039217770099639892578125000000000000000000000000000000000000000000000000000000000
## 101 0.0000000000000000000261773200721502965083314906635791885491926223039627075195312500000000000000000000000000000000000000000000000000
## 102 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 103 0.0000000006083382439248878489033989236389743382460437715053558349609375000000000000000000000000000000000000000000000000000000000000
## 104 0.0000003353599720048645211299687129979929522960446774959564208984375000000000000000000000000000000000000000000000000000000000000000
## 105 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 106 0.0000016041792108189707474243113072276400998816825449466705322265625000000000000000000000000000000000000000000000000000000000000000
## 107 0.0000000000000000497518729627603853351430762685936315392609685659408569335937500000000000000000000000000000000000000000000000000000
## 108 0.0000000000000000018424143769188722083440801213782833656296133995056152343750000000000000000000000000000000000000000000000000000000
## 109 0.0000000000015525206102598691707949302154290194266650360077619552612304687500000000000000000000000000000000000000000000000000000000
## 110 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 111 0.0000046403604357248198759682000780202315581846050918102264404296875000000000000000000000000000000000000000000000000000000000000000
## 112 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 113 0.0000000000054447604066151702997475303469343543838476762175559997558593750000000000000000000000000000000000000000000000000000000000
## 114 0.0000000000000000854468874593949442950574502120275610650423914194107055664062500000000000000000000000000000000000000000000000000000
## 115 0.0000000000000000000000000328478098195932424578635799861103805596940219402313232421875000000000000000000000000000000000000000000000
## 116 0.9631478107009148192929615106550045311450958251953125000000000000000000000000000000000000000000000000000000000000000000000000000000
## 117 0.9632965163312937617590137051593046635389328002929687500000000000000000000000000000000000000000000000000000000000000000000000000000
## 118 0.0000000520810023534324404315826173572645529930014163255691528320312500000000000000000000000000000000000000000000000000000000000000
## 119 0.9997615940675353973787764516600873321294784545898437500000000000000000000000000000000000000000000000000000000000000000000000000000
## 120 0.0000000047096392859003487728342118856517117819748818874359130859375000000000000000000000000000000000000000000000000000000000000000
## 121 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 122 0.0000000001868125427808899112567392331030191598983947187662124633789062500000000000000000000000000000000000000000000000000000000000
## 123 0.0000000000567889963290498762330665849518140930740628391504287719726562500000000000000000000000000000000000000000000000000000000000
## 124 0.0000000000000000000000000000000000000000958426943095521203554165712290568990283645689487457275390625000000000000000000000000000000
## 125 0.0000014772734178086702426773574314822212727449368685483932495117187500000000000000000000000000000000000000000000000000000000000000
## 126 0.0000000000002627735229406056433098040625573332818021299317479133605957031250000000000000000000000000000000000000000000000000000000
## 127 0.0000000101304546106378335571657212099694334028754383325576782226562500000000000000000000000000000000000000000000000000000000000000
## 128 0.1220332778200839546345335406840604264289140701293945312500000000000000000000000000000000000000000000000000000000000000000000000000
## 129 0.0000000000803487315692802863548171998786529002245515584945678710937500000000000000000000000000000000000000000000000000000000000000
## 130 0.0453058045180589480382948863734782207757234573364257812500000000000000000000000000000000000000000000000000000000000000000000000000
## 131 0.9999999753534921653752576276019681245088577270507812500000000000000000000000000000000000000000000000000000000000000000000000000000
## 132 0.0000000031907750854301417172288052315209938569751102477312088012695312500000000000000000000000000000000000000000000000000000000000
## 133 0.9098006723928865335793148005905095487833023071289062500000000000000000000000000000000000000000000000000000000000000000000000000000
## 134 0.0000000000000000000000000000000001151321157572149220553112125564609868888510391116142272949218750000000000000000000000000000000000
## 135 0.0000004996853867304061334961989837566420646908227354288101196289062500000000000000000000000000000000000000000000000000000000000000
## 136 0.0051162367150289550163377860769742255797609686851501464843750000000000000000000000000000000000000000000000000000000000000000000000
## 137 0.9999999999901685310277343887719325721263885498046875000000000000000000000000000000000000000000000000000000000000000000000000000000
## 138 0.0000000000394102933786781959336897873669158798293210566043853759765625000000000000000000000000000000000000000000000000000000000000
## 139 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 140 0.9994345011748987372968144882179331034421920776367187500000000000000000000000000000000000000000000000000000000000000000000000000000
## 141 0.9999999999999593658372987192706204950809478759765625000000000000000000000000000000000000000000000000000000000000000000000000000000
## 142 0.0007332173650095377692714748718572081997990608215332031250000000000000000000000000000000000000000000000000000000000000000000000000
## 143 0.9999989810572786907982845150399953126907348632812500000000000000000000000000000000000000000000000000000000000000000000000000000000
## 144 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 145 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 146 0.0000000000470037617179969324527483154341211957216728478670120239257812500000000000000000000000000000000000000000000000000000000000
## 147 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 148 0.0000002317276715220916731043099279219177333288826048374176025390625000000000000000000000000000000000000000000000000000000000000000
## 149 0.0000000849626241133124053550584875438289600424468517303466796875000000000000000000000000000000000000000000000000000000000000000000
## 150 0.0000000000000000000000000000000000000000000000000000000000000000001074880156031621173252747769577553071940201334655284881591796875
## 151 0.0008170369274782562397929641662130961776711046695709228515625000000000000000000000000000000000000000000000000000000000000000000000
## 152 0.0000000000000000000000265610828941182368833837101895767318637808784842491149902343750000000000000000000000000000000000000000000000
## 153 0.0000000000000000215275713379214336286315845114813782856799662113189697265625000000000000000000000000000000000000000000000000000000
## 154 0.0000000000000448203982122825164398707831203694240684853866696357727050781250000000000000000000000000000000000000000000000000000000
## 155 0.0000000004528835437026721726615219321132599361590109765529632568359375000000000000000000000000000000000000000000000000000000000000
## 156 0.0000142084500430176112343097494239430034213000908493995666503906250000000000000000000000000000000000000000000000000000000000000000
## 157 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 158 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## 159 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
##         Sensitivity Specificity  Accuracy
## Best DA   0.9393939   0.9354839 0.9371069
##     Best_Neural_Network_Predictions_Dummy
## 1                                       1
## 2                                       1
## 3                                       1
## 4                                       1
## 5                                       1
## 6                                       1
## 7                                       0
## 8                                       1
## 9                                       1
## 10                                      1
## 11                                      1
## 12                                      1
## 13                                      1
## 14                                      1
## 15                                      1
## 16                                      1
## 17                                      1
## 18                                      0
## 19                                      0
## 20                                      1
## 21                                      0
## 22                                      1
## 23                                      0
## 24                                      1
## 25                                      1
## 26                                      0
## 27                                      1
## 28                                      0
## 29                                      1
## 30                                      1
## 31                                      1
## 32                                      1
## 33                                      0
## 34                                      0
## 35                                      0
## 36                                      0
## 37                                      1
## 38                                      0
## 39                                      1
## 40                                      1
## 41                                      0
## 42                                      0
## 43                                      0
## 44                                      0
## 45                                      0
## 46                                      0
## 47                                      0
## 48                                      1
## 49                                      0
## 50                                      0
## 51                                      0
## 52                                      0
## 53                                      0
## 54                                      1
## 55                                      0
## 56                                      1
## 57                                      0
## 58                                      0
## 59                                      1
## 60                                      0
## 61                                      0
## 62                                      1
## 63                                      0
## 64                                      0
## 65                                      1
## 66                                      0
## 67                                      1
## 68                                      1
## 69                                      1
## 70                                      0
## 71                                      0
## 72                                      1
## 73                                      0
## 74                                      1
## 75                                      0
## 76                                      1
## 77                                      0
## 78                                      1
## 79                                      1
## 80                                      0
## 81                                      1
## 82                                      1
## 83                                      1
## 84                                      0
## 85                                      1
## 86                                      0
## 87                                      0
## 88                                      0
## 89                                      0
## 90                                      0
## 91                                      0
## 92                                      1
## 93                                      0
## 94                                      0
## 95                                      0
## 96                                      0
## 97                                      1
## 98                                      0
## 99                                      0
## 100                                     0
## 101                                     0
## 102                                     1
## 103                                     0
## 104                                     0
## 105                                     1
## 106                                     0
## 107                                     0
## 108                                     0
## 109                                     0
## 110                                     1
## 111                                     0
## 112                                     1
## 113                                     0
## 114                                     0
## 115                                     0
## 116                                     0
## 117                                     1
## 118                                     0
## 119                                     0
## 120                                     0
## 121                                     1
## 122                                     0
## 123                                     0
## 124                                     0
## 125                                     1
## 126                                     0
## 127                                     0
## 128                                     0
## 129                                     0
## 130                                     0
## 131                                     1
## 132                                     0
## 133                                     0
## 134                                     0
## 135                                     0
## 136                                     0
## 137                                     1
## 138                                     0
## 139                                     1
## 140                                     0
## 141                                     1
## 142                                     0
## 143                                     1
## 144                                     1
## 145                                     1
## 146                                     0
## 147                                     1
## 148                                     0
## 149                                     0
## 150                                     0
## 151                                     0
## 152                                     0
## 153                                     0
## 154                                     0
## 155                                     0
## 156                                     0
## 157                                     1
## 158                                     1
## 159                                     1
##     Best_Neural_Network_Predictions_Probabilities
## 1                                   0.99999369785
## 2                                   0.99995178208
## 3                                   0.99967082496
## 4                                   0.99989008907
## 5                                   0.99857862853
## 6                                   0.99999912764
## 7                                   0.00078424028
## 8                                   0.99058084821
## 9                                   0.99994862982
## 10                                  0.99953126797
## 11                                  0.99878830428
## 12                                  0.72952214013
## 13                                  0.99390807526
## 14                                  0.99781170241
## 15                                  0.99996796688
## 16                                  0.99995164895
## 17                                  0.99999599600
## 18                                  0.00558704745
## 19                                  0.00864241303
## 20                                  0.99998063324
## 21                                  0.00096614613
## 22                                  0.99996961811
## 23                                  0.00064219188
## 24                                  0.99937992989
## 25                                  0.73194142906
## 26                                  0.00007041882
## 27                                  0.88763184702
## 28                                  0.06256823669
## 29                                  0.99999330653
## 30                                  0.99997850011
## 31                                  0.94204935601
## 32                                  0.99999425033
## 33                                  0.00016598179
## 34                                  0.00148147353
## 35                                  0.00032244816
## 36                                  0.00043893643
## 37                                  0.99877808534
## 38                                  0.00047681857
## 39                                  0.99996851426
## 40                                  0.99998012687
## 41                                  0.00010367647
## 42                                  0.01692568339
## 43                                  0.00159618549
## 44                                  0.21706307688
## 45                                  0.00036450055
## 46                                  0.00040508702
## 47                                  0.00007923638
## 48                                  0.99999750162
## 49                                  0.00062170058
## 50                                  0.00018794521
## 51                                  0.00067805904
## 52                                  0.00019123316
## 53                                  0.00391673363
## 54                                  0.99999693290
## 55                                  0.00143229503
## 56                                  0.99999745969
## 57                                  0.00011794213
## 58                                  0.00012251458
## 59                                  0.99959655470
## 60                                  0.00155967132
## 61                                  0.00078002120
## 62                                  0.99989893827
## 63                                  0.00208466800
## 64                                  0.00051810352
## 65                                  0.93555293905
## 66                                  0.00034061069
## 67                                  0.99986030865
## 68                                  0.99998968257
## 69                                  0.63704553416
## 70                                  0.00039362585
## 71                                  0.00430321597
## 72                                  0.99999699122
## 73                                  0.00008087768
## 74                                  0.99996008706
## 75                                  0.10974143410
## 76                                  0.99999034491
## 77                                  0.00114921225
## 78                                  0.99999698967
## 79                                  0.99998295925
## 80                                  0.00051092692
## 81                                  0.99999514751
## 82                                  0.99994028448
## 83                                  0.99999238899
## 84                                  0.01615532669
## 85                                  0.99998819840
## 86                                  0.00206753151
## 87                                  0.00005984042
## 88                                  0.00047214295
## 89                                  0.00183945840
## 90                                  0.00054016441
## 91                                  0.00046749024
## 92                                  0.99999704596
## 93                                  0.00008849256
## 94                                  0.00401458187
## 95                                  0.00045657924
## 96                                  0.00056996081
## 97                                  0.99252959781
## 98                                  0.00044044525
## 99                                  0.00021118419
## 100                                 0.00060570614
## 101                                 0.00032680896
## 102                                 0.99999510902
## 103                                 0.00046435962
## 104                                 0.00033933671
## 105                                 0.99999705865
## 106                                 0.00052605403
## 107                                 0.00008015152
## 108                                 0.00086388358
## 109                                 0.00036713027
## 110                                 0.99999628406
## 111                                 0.10525678305
## 112                                 0.99988635872
## 113                                 0.00526028556
## 114                                 0.00028463503
## 115                                 0.00010447583
## 116                                 0.41176600162
## 117                                 0.98899476247
## 118                                 0.00008796902
## 119                                 0.07463847274
## 120                                 0.00126443964
## 121                                 0.99999616707
## 122                                 0.00015896985
## 123                                 0.00065866409
## 124                                 0.00001877255
## 125                                 0.50775408491
## 126                                 0.02588968679
## 127                                 0.00172216804
## 128                                 0.01476399870
## 129                                 0.10461971603
## 130                                 0.01896159103
## 131                                 0.75518213831
## 132                                 0.00130404413
## 133                                 0.40005131372
## 134                                 0.00012104954
## 135                                 0.00056154789
## 136                                 0.26018054706
## 137                                 0.99930499071
## 138                                 0.00032076466
## 139                                 0.99999599392
## 140                                 0.00503075742
## 141                                 0.99962126343
## 142                                 0.00868131201
## 143                                 0.94541060757
## 144                                 0.99999549248
## 145                                 0.99998950853
## 146                                 0.00260190515
## 147                                 0.99999683429
## 148                                 0.00047157738
## 149                                 0.20363102154
## 150                                 0.00031408588
## 151                                 0.12025797038
## 152                                 0.00016951203
## 153                                 0.00067645005
## 154                                 0.00020484482
## 155                                 0.00212811550
## 156                                 0.03153583689
## 157                                 0.99999699254
## 158                                 0.99999291105
## 159                                 0.99999699271
##                     Sensitivity Specificity  Accuracy
## Best Neural Network    0.969697   0.9784946 0.9748428

Majority Vote

# Combine the class predictions (column 1) of the five best models
Majority_DF <- data.frame(DF_Best_Logistic_Predictions[,1], DF_Best_KNN_Predictions[,1], DF_best_boosted_pred[,1], DF_Best_DA_Predictions[,1], DF_Best_Neural_Network_Predictions[,1])

# Majority vote: predict 1 when more than half of the five models vote 1.
# Vectorized with rowMeans() instead of the original per-row loop, which
# also wrapped an already-scalar value in sum() redundantly.
Majority_DF[, 6] <- ifelse(rowMeans(Majority_DF[, 1:5]) > 0.5, 1, 0)

colnames(Majority_DF) <- c("Logistic Regression","K-Nearest Neighbor", "Boosted Trees","Discriminant Analysis", "Neural Network","Majority Vote")

# True labels (factor -> character -> numeric keeps the 0/1 coding intact)
Majority_DF$'Validation Actual' <- as.numeric(as.character(Validation$diagnosis))

# Caption typo fixed: "Valildation" -> "Validation"
DT::datatable(Majority_DF, caption = "Best 5 Models on Validation - Majority of Votes") 

Average of Models Probabilities

# Combine the predicted probabilities (column 2) of the five best models
Average_DF <- data.frame(DF_Best_Logistic_Predictions[,2], DF_Best_KNN_Predictions[,2], DF_best_boosted_pred[,2], DF_Best_DA_Predictions[,2], DF_Best_Neural_Network_Predictions[,2])

# Ensemble probability = mean of the five model probabilities.
# rowMeans() replaces the original per-row loop (the loop also wrapped an
# already-scalar expression in sum() redundantly).
Average_DF[, 6] <- rowMeans(Average_DF[, 1:5])

colnames(Average_DF) <- c("Logistic Regression","K-Nearest Neighbor", "Boosted Trees","Discriminant Analysis", "Neural Network","Average")

# Classify with the standard 0.5 cutoff on the averaged probability
Average_DF$'Average Cutoff 0.5' <- ifelse(Average_DF$`Average`>0.5,1,0)

# True labels (factor -> character -> numeric keeps the 0/1 coding intact)
Average_DF$'Validation Actual' <- as.numeric(as.character(Validation$diagnosis))

DT::datatable(round(Average_DF,4), caption = "Best 5 Models on Validation - Average of Probabilities") 

Comparison of Models

# Majority vote Confusion Matrix
# Compare the majority-vote column (6) against the actual labels (column 7)
Majority_factor <- data.frame(lapply(Majority_DF[, 6:7], as.factor))
Majority_confusion <- confusionMatrix(Majority_factor[, 1], Majority_factor[, 2], positive = "1")
draw_confusion_matrix(Majority_confusion)

# Average Confusion Matrix
# Compare the 0.5-cutoff ensemble column (7) against the actual labels (column 8)
Average_factor <- data.frame(lapply(Average_DF[, 7:8], as.factor))
Average_confusion <- confusionMatrix(Average_factor[, 1], Average_factor[, 2], positive = "1")
draw_confusion_matrix(Average_confusion)

Comments: the Average seems to be better in Accuracy and Sensitivity.

Unsupervised Learning

Cluster Analysis

K-Means Clustering

set.seed(1)

# Work on a plain-data.frame copy so ORIGINAL is left untouched
KClusteringDF <- data.frame(ORIGINAL)

# Drop the first column ("ID"): it carries no measurement information
KClusteringDF <- KClusteringDF[, -1]

# Center and scale every feature so that distance-based clustering is not
# dominated by variables with large units
Norm_Kmeans <- preProcess(KClusteringDF, method = c("center", "scale"))
KClusteringDF_Preprocess <- predict(Norm_Kmeans, KClusteringDF)

# Split the scaled observations by diagnosis (0 = benign, 1 = malign)
Benign_ClusterDF <- KClusteringDF_Preprocess[KClusteringDF_Preprocess$diagnosis == 0, ]
Malign_ClusterDF <- KClusteringDF_Preprocess[KClusteringDF_Preprocess$diagnosis == 1, ]

# Keep an unscaled malign subset for profiling in original units later on
Malign_ClusterDF_No_Scale <- KClusteringDF[KClusteringDF$diagnosis == 1, ]

K-Means Clustering with all Dataset - Model 1

Let’s check if accounting for the whole dataset, we can find meaningful clusters.

set.seed(1)

# Load Library
library(factoextra)

# Relabel the 0/1 diagnosis as Benign/Malign so the row names (used as
# point labels in the cluster plot) are readable
KClusteringDF_Preprocess$diagnosis <- factor(KClusteringDF_Preprocess$diagnosis, levels = c(0, 1), labels = c("Benign", "Malign"))

rownames(KClusteringDF_Preprocess) <- paste(KClusteringDF_Preprocess$diagnosis, seq_len(nrow(KClusteringDF_Preprocess)), sep = "_")

# Diagnose the optimal k with three criteria: elbow (within-cluster SS),
# average silhouette width, and the gap statistic
fviz_nbclust(KClusteringDF_Preprocess[, -1], kmeans, method = "wss")
fviz_nbclust(KClusteringDF_Preprocess[, -1], kmeans, method = "silhouette")
fviz_nbclust(KClusteringDF_Preprocess[, -1], kmeans, method = "gap_stat")

# Fit k-means with k = 2; nstart = 100 random restarts guard against
# poor local optima
Cluster_ALL <- kmeans(KClusteringDF_Preprocess[, -1], centers = 2, iter.max = 100, nstart = 100)

Comments: We can find the optimal number of clusters with 3 different methods: WSS (Within-Cluster Sum of Squared Errors), also called the Elbow Method; the Silhouette Method, which accounts for the separation between clusters; or lastly the Gap Statistic. Here all three methods give us an optimal number of 2 clusters, which makes sense given that tumors are either Benign or Malign. Let's see it graphically and how it performs against the real world.

Model 1 - Plot
set.seed(1)

# Plotting Clusters of Model 1
# labelsize = 0 suppresses fviz_cluster's own labels so geom_text() can
# place the Benign_i / Malign_i row names with overlap checking instead
fviz_cluster(Cluster_ALL, data = KClusteringDF_Preprocess[,-1], main="Cluster Model 1", labelsize = 0)+ geom_text(
    label=rownames(KClusteringDF_Preprocess), 
    nudge_x = 0.25, nudge_y = 0.25, 
    check_overlap = T, size=2)

Comments: We can see that K-Means, without any human intervention, found 2 clusters to be optimal on the whole dataset, and separated benign and malign tumors accordingly. We can check how it performed. Here Cluster Number 1 would be the Malign Tumors and Cluster Number 2 the Benign Tumors.

Model 1 - Performance
set.seed(1)

# Pair each observation's assigned cluster with its true diagnosis
DF_Cluster_Performance <- as.data.frame(Cluster_ALL$cluster)
DF_Cluster_Performance <- cbind(DF_Cluster_Performance, KClusteringDF_Preprocess$diagnosis)
# seq_len(nrow(...)) fixes the original `1:dim(df)` bug: dim() returns a
# length-2 vector, of which only the first element was (silently) used,
# producing the "numerical expression has 2 elements" warning
rownames(DF_Cluster_Performance) <- seq_len(nrow(DF_Cluster_Performance))
# Recode to a common 0/1 scheme: cluster 1 -> 1 (Malign), cluster 2 -> 0;
# diagnosis "Malign" -> 1, "Benign" -> 0
DF_Cluster_Performance$`Cluster_ALL$cluster` <- ifelse(DF_Cluster_Performance$`Cluster_ALL$cluster` == 1, 1, 0)
DF_Cluster_Performance$`KClusteringDF_Preprocess$diagnosis` <- ifelse(DF_Cluster_Performance$`KClusteringDF_Preprocess$diagnosis` == "Malign", 1, 0)

# Convert both binary columns to factors, as caret::confusionMatrix() requires
DF_Cluster_Performance$`Cluster_ALL$cluster` <- factor(DF_Cluster_Performance$`Cluster_ALL$cluster`)
DF_Cluster_Performance$`KClusteringDF_Preprocess$diagnosis` <- factor(DF_Cluster_Performance$`KClusteringDF_Preprocess$diagnosis`)

# Confusion Matrix ("1" = Malign is the positive class)
Confusion_Matrix_K_Means1 <- confusionMatrix(data = DF_Cluster_Performance$`Cluster_ALL$cluster`, reference = DF_Cluster_Performance$`KClusteringDF_Preprocess$diagnosis`, positive = "1")

# Create the Function for Confusion Matrix
# Renders a caret::confusionMatrix object `cm` as a base-graphics figure:
# a colored 2x2 Benign/Malignant matrix on top, plus a strip of summary
# statistics (sensitivity, specificity, precision, recall, F1, accuracy,
# kappa) underneath. Called for its plotting side effect; returns nothing useful.
draw_confusion_matrix_K_Means1 <- function(cm) {

  # Two stacked panels: matrix plot (2/3 height) and details strip (1/3)
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX for K-Means - Model 1', cex.main=2)

  # create the matrix 
  rect(150, 430, 240, 370, col='#1c6155')
  text(195, 435, 'Benign', cex=1.2)
  rect(250, 430, 340, 370, col='#1c615570')
  text(295, 435, 'Malignant', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  rect(150, 305, 240, 365, col='#1c615570')
  rect(250, 305, 340, 365, col='#1c6155')
  text(140, 400, 'Benign', cex=1.2, srt=90)
  text(140, 335, 'Malignant', cex=1.2, srt=90)

  # add in the cm results 
  # as.numeric() flattens the 2x2 table column-wise:
  # res[1]=[1,1], res[2]=[2,1], res[3]=[1,2], res[4]=[2,2]
  res <- as.numeric(cm$table)
  text(195, 400, res[1], cex=1.6, font=2, col='white')
  text(195, 335, res[2], cex=1.6, font=2, col='white')
  text(295, 400, res[3], cex=1.6, font=2, col='white')
  text(295, 335, res[4], cex=1.6, font=2, col='white')

  # add in the specifics 
  # caret byClass ordering: [1] Sensitivity, [2] Specificity,
  # [5] Precision, [6] Recall, [7] F1
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
  text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
  text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
  text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
  text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
  text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
  text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
  text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
  text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
  text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)

  # add in the accuracy information 
  # caret overall ordering: [1] Accuracy, [2] Kappa
  text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
  text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
  text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
  text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
}

# Plot the Confusion Matrix
draw_confusion_matrix_K_Means1(Confusion_Matrix_K_Means1)

Comments: We can appreciate that the K-Means algorithm found 2 different types of tumors in our dataset (Malign and Benign), showing that we do have good separability from our features. The Accuracy is not very good; then again, clustering is not aimed at being good at prediction but rather at revealing insights about the dataset when combined with some knowledge of the field.

K-Means Clustering with only Malign Tumors - Model 2

Since Cancerous Tumors in the Breasts are not all equal, some of them being at different stages or of different types, we could apply the K-Means Clustering Model to find out whether there is some separation among them and whether it could suggest an a priori number of clusters for further analysis in medical research.

Types of Breast Cancer - American Society

Breast Cancer Stages

For examples, we can find this article about how stages are rated:

“In both staging systems, 7 key pieces of information are used:

  1. The extent (size) of the tumor (T): How large is the cancer? Has it grown into nearby areas?
  2. The spread to nearby lymph nodes (N): Has the cancer spread to nearby lymph nodes? If so, how many?
  3. The spread (metastasis) to distant sites (M): Has the cancer spread to distant organs such as the lungs or liver?
  4. Estrogen Receptor (ER) status: Does the cancer have the protein called an estrogen receptor?
  5. Progesterone Receptor (PR) status: Does the cancer have the protein called a progesterone receptor?
  6. HER2 status: Does the cancer make too much of a protein called HER2?
  7. Grade of the cancer (G): How much do the cancer cells look like normal cells?”

We can see that we lack a lot of information only using this dataset, we could only infer the size of the cancer based on 1 tumor, without nearby information. Thus we will be limited in the clustering method to only size as an information for the stage of the tumor.

set.seed(1)

# Load Library
library(factoextra)

# Labeling Tumors Type as Row Name
# diagnosis is constant in this subset (all malign); it is converted to a
# factor here and excluded from clustering below via [,-1]
Malign_ClusterDF$diagnosis <- factor(Malign_ClusterDF$diagnosis)

# Optimal Number of Clusters
# Three criteria for k: elbow (within-cluster SS), average silhouette
# width, and the gap statistic
fviz_nbclust(Malign_ClusterDF[,-1], kmeans, method = "wss")

fviz_nbclust(Malign_ClusterDF[,-1], kmeans, method = "silhouette")

fviz_nbclust(Malign_ClusterDF[,-1], kmeans, method = "gap_stat")

# Create Clusters
# Fit both candidate solutions (k = 3 and k = 2); nstart = 100 random
# restarts guard against poor local optima
Cluster_Malign_1 <- kmeans(Malign_ClusterDF[,-1], centers = 3, iter.max = 100, nstart = 100)
Cluster_Malign_2 <- kmeans(Malign_ClusterDF[,-1], centers = 2, iter.max = 100, nstart = 100)

Comments: The 3 methods don't converge to the same number of clusters, but we can see that the Elbow Method and the Silhouette Method would suggest either 2 or 3 groups as optimal. The Gap Statistic shows no cluster separation within the Malign tumors. We can try separating into 2 and 3 clusters and examine the profiles of those groups. (For simplicity, we only compare mean variables as meaningful measures to interpret our tumors.)

Model 2 with 3 Clusters - Plot
set.seed(1)

# Plotting Clusters of Model 1
# 3-cluster solution on the malign tumors; labelsize = 0 hides point labels
fviz_cluster(Cluster_Malign_1, data = Malign_ClusterDF[,-1], main="Cluster Model 2 - Only Malign Tumors", subtitle="with 3 Clusters", labelsize = 0)

Comments: Some overlap occurs in this 2D graph, but considering all dimensions there is no overlap at all. With human interpretation, we can see that there could indeed be 3 different clusters within the Malign Tumors. Let's check the centroids.

Centroid of Model 2 with 3 Clusters
# Centroids of the 3-cluster solution (one row per cluster, scaled units)
Clusters_Malign_1_Centers <- Cluster_Malign_1$centers

DT::datatable(round(Clusters_Malign_1_Centers,5), caption = "Centroid from Model 2 - 3 Clusters")

Comments: We can see that radius_mean (perimeter_mean and area_mean are quite similar) is indeed one variable clearly separating the Malign Tumors into 3 clusters, as are concavity_mean and compactness_mean. smoothness_mean also separates clusters from each other. (For simplicity, we only compare mean variables as meaningful measures to interpret our tumors.)

Cluster Members with 3 Clusters
# Cluster Members
# Partition the unscaled malign observations by the 3-cluster assignment
# so the per-cluster tables show measurements in their original units
split1 <- split(Malign_ClusterDF_No_Scale, Cluster_Malign_1$cluster)

# Split Cluster to Original Malign Not Scaled DF
cluster_1 <- split1[["1"]]
cluster_2 <- split1[["2"]]
cluster_3 <- split1[["3"]]

# Data Table (diagnosis column dropped - it is constant in this subset)
DT::datatable(cluster_1[, -1], caption = "Cluster 1")
DT::datatable(cluster_2[, -1], caption = "Cluster 2 ")
DT::datatable(cluster_3[, -1], caption = "Cluster 3")
Boxplots for Comparison of radius_mean
# Load Libraries
library(ggpubr)
library(ggplot2)

# Helper: standardized boxplot of radius_mean for one cluster's members.
# Replaces the original triplicated ggplot code; also fixes the
# user-facing subtitle typo "millimiters" -> "millimeters".
plot_radius_box <- function(cluster_df, cluster_title) {
  ggplot(cluster_df) +
    aes(x = "", y = radius_mean) +
    geom_boxplot(fill = "#1c6155") +
    labs(title = cluster_title,
         subtitle = "in millimeters") +
    theme_minimal() + ylim(10, 30)
}

boxcluster1 <- plot_radius_box(cluster_1, "Cluster 1")
boxcluster2 <- plot_radius_box(cluster_2, "Cluster 2")
boxcluster3 <- plot_radius_box(cluster_3, "Cluster 3")

# Arrange the three boxplots side by side under a shared title
ggarrange1 <- ggarrange(boxcluster1, boxcluster2, boxcluster3, ncol = 3)
annotate_figure(ggarrange1,
                top = text_grob("Boxplot for radius_mean Among Clusters", color = "black", face = "bold", size = 14))

Comments: We can see some tumors being greater than 20mm, others between 15mm and 20mm, and lastly some under 15mm. Such a result in the cluster medians is very interesting, and knowing how the staging system works can actually lead us to prefer the clustering into 2 groups: one cluster being below 20mm and the other greater than or equal to 20mm. We will do so in the following part.

Model 2 with 2 Clusters - Plot
set.seed(1)

# Plotting Clusters of Model 1
# 2-cluster solution on the malign tumors; labelsize = 0 hides point labels
fviz_cluster(Cluster_Malign_2, data = Malign_ClusterDF[,-1], main="Cluster Model 2 - Only Malign Tumors", subtitle="with 2 Clusters", labelsize = 0)

Comments: Overlapping still occurs in such 2d graphs, but we can see also the trend of 2 groups, the Red one being more spread than the blue one, and some spread also happen in the bottom center of the plot for the blue cluster.

Centroid of Model 2 with 2 Clusters
# Centroids of the 2-cluster solution (one row per cluster, scaled units)
Clusters_Malign_2_Centers <- Cluster_Malign_2$centers

DT::datatable(round(Clusters_Malign_2_Centers,5), caption = "Centroid from Model 2 - 2 Clusters")

Comments: We can also see the radius_mean (perimeter_mean and area_mean) being very important in the separation, compactness_mean and concavity_mean as well. symmetry_mean is also quite different and smoothness_mean as well.

Cluster Members with 2 Clusters
# Cluster Members
# Partition the unscaled malign observations by the 2-cluster assignment
split2 <- split(Malign_ClusterDF_No_Scale, Cluster_Malign_2$cluster)

# Split Cluster to Original Malign Not Scaled DF
cluster2_1 <- split2[["1"]]
cluster2_2 <- split2[["2"]]

# Data Table (diagnosis column dropped - it is constant in this subset)
DT::datatable(cluster2_1[, -1], caption = "Cluster 1")
DT::datatable(cluster2_2[, -1], caption = "Cluster 2")
Boxplots for Comparison of radius_mean
# Load Libraries
library(ggpubr)
library(ggplot2)

# Boxplot with ggplot
# BUG FIX: this section profiles the 2-cluster split, so it must plot
# cluster2_1 / cluster2_2 (from split2). The original reused cluster_1 /
# cluster_2 from the 3-cluster solution, so the T1/T2 boxplots showed the
# wrong data. Subtitle typo "millimiters" also corrected.
boxcluster1 <- ggplot(cluster2_1) +
 aes(x = "", y = radius_mean) +
 geom_boxplot(fill = "#1c6155") +
 labs(title = "Cluster 1 (T2)",
 subtitle = "in millimeters") +
 theme_minimal() + ylim(10, 30)

boxcluster2 <- ggplot(cluster2_2) +
 aes(x = "", y = radius_mean) +
 geom_boxplot(fill = "#1c6155") +
 labs(title = "Cluster 2 (T1)",
 subtitle = "in millimeters") +
 theme_minimal() + ylim(10, 30)

# Arrange side by side with a shared title
ggarrange2 <- ggarrange(boxcluster1, boxcluster2)
annotate_figure(ggarrange2,
                top = text_grob("Boxplot for radius_mean Among Clusters", color = "black", face = "bold", size = 14))

Comments: Without having the full set of key information for the staging systems from the American Cancer Society, we can already provide some metrics for the T key, which is the size of the tumor, though without information on nearby areas. The dataset suggests that the measures are for primary tumors only. If we look at Cluster 1, the radius_mean median seems to be higher than 2cm (20mm) but less than 5cm (50mm); thus we would attribute the T2 key to this cluster. Cluster 2, in contrast, has a median close to 1.4cm (14mm); since this is less than 2cm (20mm), we could attribute the key T1 to this cluster. Nevertheless, we should remember that some members of Cluster 1 measure less than 20mm, and thus we shouldn't categorize them as T2 following the guidelines. For simplicity, we will keep those tumors in Cluster 1, but if we wanted to decide whether or not a member qualifies as T2, we should use other metrics to verify the exact size before removing it from the T2 label.

Cluster 2 with T1 could potentially lead us to such Stages:

Stage IA: The tumor is small, invasive, and has not spread to the lymph nodes (T1, N0, M0). Stage IB: Cancer has spread to the lymph nodes and the cancer in the lymph node is larger than 0.2 mm but less than 2 mm in size. There is either no evidence of a tumor in the breast or the tumor in the breast is 20 mm or smaller (T0 or T1, N1mi, M0). Stage IIIC: A tumor of any size that has spread to 10 or more axillary lymph nodes, the internal mammary lymph nodes, and/or the lymph nodes under the collarbone. It has not spread to other parts of the body (any T, N3, M0). Stage IV (metastatic): The tumor can be any size and has spread to other organs, such as the bones, lungs, brain, liver, distant lymph nodes, or chest wall (any T, any N, M1). Metastatic cancer found when the cancer is first diagnosed occurs about 6% of the time. This may be called de novo metastatic breast cancer. Most commonly, metastatic breast cancer is found after a previous diagnosis of early stage breast cancer.

Cluster 1 with T2 could potentially lead us to such Stages:

Stage IIA: Any 1 of these conditions: The tumor is larger than 20 mm but not larger than 50 mm and has not spread to the axillary lymph nodes (T2, N0, M0). Stage IIB: The tumor is larger than 20 mm but not larger than 50 mm and has spread to 1 to 3 axillary lymph nodes (T2, N1, M0). Stage IIIA: The tumor of any size has spread to 4 to 9 axillary lymph nodes or to internal mammary lymph nodes. It has not spread to other parts of the body (T0, T1, T2, or T3; N2; M0). Stage IIIC: A tumor of any size that has spread to 10 or more axillary lymph nodes, the internal mammary lymph nodes, and/or the lymph nodes under the collarbone. It has not spread to other parts of the body (any T, N3, M0). Stage IV (metastatic): The tumor can be any size and has spread to other organs, such as the bones, lungs, brain, liver, distant lymph nodes, or chest wall (any T, any N, M1). Metastatic cancer found when the cancer is first diagnosed occurs about 6% of the time. This may be called de novo metastatic breast cancer. Most commonly, metastatic breast cancer is found after a previous diagnosis of early stage breast cancer.

Comments: We can see that we lack a lot of information to actually get to the actual stage of the cancerous breast tumors, depending on the source, we may lack 2 more information if we follow Cancer.Net staging system: Node (N - Has the tumor spread to the lymph nodes? If so, where, what size, and how many?) or Metastasis (M - Has the cancer spread to other parts of the body?). The American Cancer Society requires way more information, up to 7 in total plus additional recurrence test. Here is the 7 keys parameters:

image

Breast Cancer Stages - cancer.org

Breast Cancer: Stages - Cancer.Net

Proportions of T1 and T2 - Pie Chart
# Computing Proportions of T1 and T2
# (T1 = smaller-tumor cluster2_2, T2 = larger-tumor cluster2_1)
n_T1 <- nrow(cluster2_2)
n_T2 <- nrow(cluster2_1)
n_malign <- nrow(Malign_ClusterDF_No_Scale)

# Rounding Proportions
Proportions_T1 <- round(n_T1 / n_malign, 3)
Proportions_T2 <- round(n_T2 / n_malign, 3)

# Pie Chart Dataframe
# Counts are derived from the cluster objects instead of the original
# hard-coded c(151, 61) (which also disagreed with the "221 obs." title),
# so the chart stays consistent if the clustering changes
Pie_T1_T2 <- data.frame(
  t = c("T1", "T2"),
  n = c(n_T1, n_T2),
  prop = c(Proportions_T1, Proportions_T2))

# Pie ggplot: stacked bar in polar coordinates with percentage labels
ggplot(Pie_T1_T2, aes(x = "", y = n, fill = t)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y", start = 0) + theme_void() +
  geom_text(aes(label = paste0(100 * prop, "%")),
            position = position_stack(vjust = 0.5), color = "white", size = 6) +
  labs(x = NULL, y = NULL, fill = "T Category") +
  scale_fill_manual(values = c("#1c6155", "#66807b")) +
  ggtitle(paste0("Pie Chart of T Category Proportions for Malign Tumors (", n_malign, " obs.)"))

Comments:

Conclusion

References

Logistic Regression in Machine Learning

Convergence Error in Logistic Regression

Penalized Logistic Regression Essentials in R: Ridge, Lasso and Elastic Net

Lasso Regression in R (Step-by-Step)

ROC Curve

How to create a ROC curve in R

VIF Inflation Error

Neural Network Models in R

How to choose the number of hidden layers and nodes in a feedforward neural network?

Introduction to Neural Networks for Java (second edition) by Jeff Heaton - Google Books

Cluster Analysis in R

Do we need to set training set and testing set for clustering?

K-means Clustering: Algorithm, Applications, Evaluation Methods, and Drawbacks

Types of Breast Cancer - American Society

Breast Cancer Stages - cancer.org

Breast Cancer: Stages - Cancer.Net

Classification: LDA and QDA Approaches

Discriminant Analysis Essentials in R